diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index a6eacc9354d5..4b9a58337ea4 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -62,7 +62,7 @@ jobs:
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@883d8588e56d1753a8a58c1c86e88976f0c23449 # v3.26.3
+      uses: github/codeql-action/init@2c779ab0d087cd7fe7b826087247c2c81f27bfa6 # v3.26.5
       with:
         languages: ${{ matrix.language }}
         build-mode: ${{ matrix.build-mode }}
@@ -85,6 +85,6 @@ jobs:
           make -j 2
       
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@883d8588e56d1753a8a58c1c86e88976f0c23449 # v3.26.3
+      uses: github/codeql-action/analyze@2c779ab0d087cd7fe7b826087247c2c81f27bfa6 # v3.26.5
       with:
         category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index a4b85a7ee90c..1a77e450bd96 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -66,6 +66,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@883d8588e56d1753a8a58c1c86e88976f0c23449 # v3.26.3
+        uses: github/codeql-action/upload-sarif@2c779ab0d087cd7fe7b826087247c2c81f27bfa6 # v3.26.5
         with:
           sarif_file: results.sarif
diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini
index 1fe2a62ea164..1a76ae6fd5b5 100644
--- a/packages/framework/ini-files/config-specs.ini
+++ b/packages/framework/ini-files/config-specs.ini
@@ -297,6 +297,11 @@
 
 [COMMON]
 opt-set-cmake-var CMAKE_GENERATOR STRING : Ninja
+
+# Disable deprecation warnings until the deprecated packages (e.g. Epetra) are removed;
+# otherwise they drown out the other compiler warnings.
+opt-set-cmake-var Trilinos_SHOW_DEPRECATED_WARNINGS BOOL : OFF
+
 #opt-set-cmake-var Trilinos_ENABLE_BUILD_STATS BOOL : ON
 
 opt-set-cmake-var Trilinos_PARALLEL_LINK_JOBS_LIMIT STRING : 8
@@ -1921,7 +1926,7 @@ opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS                            STRING
 opt-set-cmake-var Teko_DISABLE_LSCSTABALIZED_TPETRA_ALPAH_INV_D          BOOL         : ON
 opt-set-cmake-var KokkosKernels_blas_serial_MPI_1_DISABLE                BOOL         : ON
 opt-set-cmake-var ROL_example_PDE-OPT_helmholtz_example_02_MPI_1_DISABLE BOOL         : ON
-opt-set-cmake-var CMAKE_CXX_FLAGS                                        STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-error -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS                                        STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-error
 
 # Test failures as of 11-28-22
 opt-set-cmake-var ROL_example_PDE-OPT_navier-stokes_example_01_MPI_4_DISABLE BOOL : ON
@@ -2002,7 +2007,7 @@ use USE-DEPRECATED|YES
 use COMMON_USE-MPI|NO
 
 opt-set-cmake-var Trilinos_ENABLE_Fortran OFF    BOOL         : OFF
-opt-set-cmake-var CMAKE_CXX_FLAGS                STRING       : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-parentheses -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unused-label -Werror -Werror=shadow -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS                STRING       : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-parentheses -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unused-label -Werror -Werror=shadow
 opt-set-cmake-var TPL_ENABLE_ParMETIS            BOOL FORCE   : OFF
 
 use GCC_PACKAGE_SPECIFIC_WARNING_FLAGS
@@ -2038,7 +2043,7 @@ opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS                   STRING       : -
 opt-set-cmake-var CMAKE_CXX_EXTENSIONS                          BOOL         : OFF
 opt-set-cmake-var Teko_DISABLE_LSCSTABALIZED_TPETRA_ALPAH_INV_D BOOL         : ON
 opt-set-cmake-var ROL_test_algorithm_TypeP_CompareTypeU_MPI_1_DISABLE BOOL         : ON
-opt-set-cmake-var CMAKE_CXX_FLAGS                               STRING       : -fno-strict-aliasing -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-parentheses -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-inline -Wno-nonnull-compare -Wno-address -Wno-error -Werror=shadow -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS                               STRING       : -fno-strict-aliasing -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-parentheses -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-inline -Wno-nonnull-compare -Wno-address -Wno-error -Werror=shadow
 
 use GCC_OPENMP_PACKAGE_SPECIFIC_WARNING_FLAGS
 
@@ -2929,7 +2934,7 @@ opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS                            STRING
 opt-set-cmake-var Teko_DISABLE_LSCSTABALIZED_TPETRA_ALPAH_INV_D          BOOL         : ON
 opt-set-cmake-var KokkosKernels_blas_serial_MPI_1_DISABLE                BOOL         : ON
 opt-set-cmake-var ROL_example_PDE-OPT_helmholtz_example_02_MPI_1_DISABLE BOOL         : ON
-opt-set-cmake-var CMAKE_CXX_FLAGS                                        STRING       : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS                                        STRING       : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline
 
 # Test failures as of 11-28-22
 opt-set-cmake-var ROL_example_PDE-OPT_navier-stokes_example_01_MPI_4_DISABLE BOOL : ON
@@ -2979,7 +2984,7 @@ opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS                            STRING
 opt-set-cmake-var Teko_DISABLE_LSCSTABALIZED_TPETRA_ALPAH_INV_D          BOOL         : ON
 opt-set-cmake-var KokkosKernels_blas_serial_MPI_1_DISABLE                BOOL         : ON
 opt-set-cmake-var ROL_example_PDE-OPT_helmholtz_example_02_MPI_1_DISABLE BOOL         : ON
-opt-set-cmake-var CMAKE_CXX_FLAGS                                        STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS                                        STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline
 
 use RHEL7_POST
 
@@ -3014,7 +3019,7 @@ use PACKAGE-ENABLES|NO-PACKAGE-ENABLES
 use COMMON_SPACK_TPLS
 
 opt-set-cmake-var Trilinos_ENABLE_Fortran OFF    BOOL         : OFF
-opt-set-cmake-var CMAKE_CXX_FLAGS                STRING       : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-parentheses -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unused-label -Wno-error -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS                STRING       : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-parentheses -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unused-label -Wno-error
 opt-set-cmake-var TPL_ENABLE_ParMETIS            BOOL FORCE   : OFF
 opt-set-cmake-var TPL_ENABLE_Pnetcdf             BOOL FORCE   : OFF
 opt-set-cmake-var TPL_Netcdf_LIBRARIES           STRING FORCE : -L${NETCDF_C_LIB|ENV};${NETCDF_C_LIB|ENV}/libnetcdf.a;${TPL_HDF5_LIBRARIES|CMAKE}
@@ -3073,7 +3078,7 @@ use COMMON_SPACK_TPLS
 opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS                   STRING       : --bind-to;none --mca btl vader,self
 opt-set-cmake-var CMAKE_CXX_EXTENSIONS                          BOOL         : OFF
 opt-set-cmake-var Teko_DISABLE_LSCSTABALIZED_TPETRA_ALPAH_INV_D BOOL         : ON
-opt-set-cmake-var CMAKE_CXX_FLAGS                               STRING       : -fno-strict-aliasing -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-parentheses -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-inline -Wno-nonnull-compare -Wno-address -Werror -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS                               STRING       : -fno-strict-aliasing -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-parentheses -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-inline -Wno-nonnull-compare -Wno-address -Werror
 
 use RHEL7_POST
 
@@ -3108,7 +3113,7 @@ opt-set-cmake-var TPL_ENABLE_Scotch   BOOL FORCE : OFF
 opt-set-cmake-var TPL_Netcdf_LIBRARIES STRING FORCE : ${NETCDF_C_LIB|ENV}/libnetcdf.so
 
 opt-set-cmake-var Trilinos_ENABLE_Fortran OFF    BOOL         : OFF
-opt-set-cmake-var CMAKE_CXX_FLAGS STRING : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-unused-label -Werror=parentheses -Werror=sign-compare -Werror=unused-variable -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS STRING : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-unused-label -Werror=parentheses -Werror=sign-compare -Werror=unused-variable
 
 use GCC_PACKAGE_SPECIFIC_WARNING_FLAGS
 
@@ -3142,7 +3147,7 @@ use COMMON_SPACK_TPLS
 opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS                   STRING       : --bind-to;none --mca btl vader,self
 opt-set-cmake-var CMAKE_CXX_EXTENSIONS                          BOOL         : OFF
 opt-set-cmake-var Teko_DISABLE_LSCSTABALIZED_TPETRA_ALPAH_INV_D BOOL         : ON
-opt-set-cmake-var CMAKE_CXX_FLAGS                               STRING       : -fno-strict-aliasing -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-inline -Wno-nonnull-compare -Wno-address -Werror=sign-compare -Werror=unused-variable -Werror=parentheses -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS                               STRING       : -fno-strict-aliasing -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-inline -Wno-nonnull-compare -Wno-address -Werror=sign-compare -Werror=unused-variable -Werror=parentheses
 
 # TPL_BLAS_LIBRARIES is redefined here with libm for SuperLU to properly link
 opt-set-cmake-var TPL_BLAS_LIBRARIES STRING FORCE : -L${BLAS_ROOT|ENV}/lib;-lblas;-lgfortran;-lgomp;-lm
@@ -3185,7 +3190,7 @@ opt-set-cmake-var ROL_example_PDE-OPT_helmholtz_example_02_MPI_1_DISABLE BOOL
 opt-set-cmake-var Pliris_vector_random_MPI_3_DISABLE                     BOOL         : ON
 opt-set-cmake-var Pliris_vector_random_MPI_4_DISABLE                     BOOL         : ON
 
-opt-set-cmake-var CMAKE_CXX_FLAGS                                        STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline -Werror=sign-compare -Werror=unused-variable -Werror=parentheses -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS                                        STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline -Werror=sign-compare -Werror=unused-variable -Werror=parentheses
 
 # Test failures as of 11-28-22
 opt-set-cmake-var ROL_example_PDE-OPT_navier-stokes_example_01_MPI_4_DISABLE BOOL : ON
@@ -3286,7 +3291,7 @@ opt-set-cmake-var TPL_LAPACK_LIBRARY_DIRS   STRING FORCE : ${OPENBLAS_ROOT|ENV}/
 opt-set-cmake-var TPL_LAPACK_LIBRARIES      STRING FORCE : ${OPENBLAS_ROOT|ENV}/lib/libopenblas.a;-L${OPENBLAS_ROOT|ENV}/lib;-lgfortran;-lgomp;-lm
 
 opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS                                STRING       : --bind-to;none --mca btl vader,self
-opt-set-cmake-var CMAKE_CXX_FLAGS                                            STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS                                            STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline
 
 opt-set-cmake-var TPL_ENABLE_SuperLUDist      BOOL FORCE : ON
 opt-set-cmake-var TPL_ENABLE_ParMETIS         BOOL FORCE : ON
@@ -3336,7 +3341,7 @@ use PACKAGE-ENABLES|NO-PACKAGE-ENABLES
 
 use COMMON_SPACK_TPLS
 
-opt-set-cmake-var CMAKE_CXX_FLAGS                STRING       : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-parentheses -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unused-label -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS                STRING       : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-parentheses -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unused-label
 
 opt-set-cmake-var TPL_BLAS_LIBRARY_DIRS     STRING FORCE : ${OPENBLAS_ROOT|ENV}/lib
 opt-set-cmake-var TPL_BLAS_LIBRARIES        STRING FORCE : ${OPENBLAS_ROOT|ENV}/lib/libopenblas.a;-L${OPENBLAS_ROOT|ENV}/lib;-lgfortran;-lgomp;-lm
@@ -3384,7 +3389,7 @@ opt-set-cmake-var ROL_example_PDE-OPT_helmholtz_example_02_MPI_1_DISABLE     BOO
 opt-set-cmake-var ROL_example_PDE-OPT_navier-stokes_example_01_MPI_4_DISABLE BOOL         : ON
 opt-set-cmake-var Pliris_vector_random_MPI_3_DISABLE                         BOOL         : ON
 opt-set-cmake-var Pliris_vector_random_MPI_4_DISABLE                         BOOL         : ON
-opt-set-cmake-var CMAKE_CXX_FLAGS                                            STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+opt-set-cmake-var CMAKE_CXX_FLAGS                                            STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline
 
 opt-set-cmake-var TPL_ENABLE_SuperLUDist BOOL FORCE: OFF
 
diff --git a/packages/intrepid2/assembly-examples/GRADGRADStandardAssembly.hpp b/packages/intrepid2/assembly-examples/GRADGRADStandardAssembly.hpp
index 099893c6facd..8fc7852e7650 100644
--- a/packages/intrepid2/assembly-examples/GRADGRADStandardAssembly.hpp
+++ b/packages/intrepid2/assembly-examples/GRADGRADStandardAssembly.hpp
@@ -145,8 +145,11 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureGRADGRAD(Intre
     // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
     fstIntegrateCall->start();
     FunctionSpaceTools::HGRADtransformGRAD(unorientedTransformedGradValues, jacobianInverse, basisGradValues);
+    // we want to exclude orientation application in the core integration timing -- this time gets reported as "Other"
+    fstIntegrateCall->stop();
     OrientationTools<DeviceType>::modifyBasisByOrientation(transformedGradValues, unorientedTransformedGradValues,
                                                            orientationsWorkset, basis.get());
+    fstIntegrateCall->start();
     
     transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
     FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues);
diff --git a/packages/intrepid2/assembly-examples/H1StandardAssembly.hpp b/packages/intrepid2/assembly-examples/H1StandardAssembly.hpp
index 455be4e39471..21fb9207ef0f 100644
--- a/packages/intrepid2/assembly-examples/H1StandardAssembly.hpp
+++ b/packages/intrepid2/assembly-examples/H1StandardAssembly.hpp
@@ -151,8 +151,11 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureH1(Intrepid2::
     // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
     fstIntegrateCall->start();
     FunctionSpaceTools::HGRADtransformGRAD(unorientedTransformedGradValues, jacobianInverse, basisGradValues);
+    // we want to exclude orientation application in the core integration timing -- this time gets reported as "Other"
+    fstIntegrateCall->stop();
     OrientationTools<DeviceType>::modifyBasisByOrientation(transformedGradValues, unorientedTransformedGradValues,
                                                            orientationsWorkset, basis.get());
+    fstIntegrateCall->start();
     transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
     FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues);
     transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedGradValues: one flop for each.
@@ -163,8 +166,11 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureH1(Intrepid2::
     ExecutionSpace().fence();
     
     FunctionSpaceTools::HGRADtransformVALUE(unorientedTransformedBasisValues, basisValues);
+    // we want to exclude orientation application in the core integration timing -- this time gets reported as "Other"
+    fstIntegrateCall->stop();
     OrientationTools<DeviceType>::modifyBasisByOrientation(transformedBasisValues, unorientedTransformedBasisValues,
                                                            orientationsWorkset, basis.get());
+    fstIntegrateCall->start();
     FunctionSpaceTools::multiplyMeasure(transformedWeightedBasisValues, cellMeasures, transformedBasisValues);
     bool sumInto = true; // add the (value,value) integral to the (grad,grad) that we've already integrated
     FunctionSpaceTools::integrate(cellStiffnessSubview, transformedBasisValues, transformedWeightedBasisValues, sumInto);
diff --git a/packages/intrepid2/assembly-examples/HCURLStandardAssembly.hpp b/packages/intrepid2/assembly-examples/HCURLStandardAssembly.hpp
index 17724153fcf5..a29c80bdbb2c 100644
--- a/packages/intrepid2/assembly-examples/HCURLStandardAssembly.hpp
+++ b/packages/intrepid2/assembly-examples/HCURLStandardAssembly.hpp
@@ -175,8 +175,11 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureHCURL(Intrepid
     // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
     fstIntegrateCall->start();
     FunctionSpaceTools::HCURLtransformCURL(unorientedTransformedCurlValues, jacobian, jacobianDeterminant, basisCurlValues);
+    // we want to exclude orientation application in the core integration timing -- this time gets reported as "Other"
+    fstIntegrateCall->stop();
     OrientationTools<DeviceType>::modifyBasisByOrientation(transformedCurlValues, unorientedTransformedCurlValues,
                                                            orientationsWorkset, basis.get());
+    fstIntegrateCall->start();
     transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
     FunctionSpaceTools::multiplyMeasure(transformedWeightedCurlValues, cellMeasures, transformedCurlValues);
     transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedCurlValues: one flop for each.
@@ -186,8 +189,11 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureHCURL(Intrepid
     FunctionSpaceTools::integrate(cellStiffnessSubview, transformedCurlValues, transformedWeightedCurlValues);
     
     FunctionSpaceTools::HCURLtransformVALUE(unorientedTransformedBasisValues, jacobianInverse, basisValues);
+    // we want to exclude orientation application in the core integration timing -- this time gets reported as "Other"
+    fstIntegrateCall->stop();
     OrientationTools<DeviceType>::modifyBasisByOrientation(transformedBasisValues, unorientedTransformedBasisValues,
                                                            orientationsWorkset, basis.get());
+    fstIntegrateCall->start();
     FunctionSpaceTools::multiplyMeasure(transformedWeightedBasisValues, cellMeasures, transformedBasisValues);
     bool sumInto = true; // add the (value,value) integral to the (curl,curl) that we've already integrated
     FunctionSpaceTools::integrate(cellStiffnessSubview, transformedBasisValues, transformedWeightedBasisValues, sumInto);
diff --git a/packages/intrepid2/assembly-examples/HDIVStandardAssembly.hpp b/packages/intrepid2/assembly-examples/HDIVStandardAssembly.hpp
index 04f415c88afc..2e50d065a732 100644
--- a/packages/intrepid2/assembly-examples/HDIVStandardAssembly.hpp
+++ b/packages/intrepid2/assembly-examples/HDIVStandardAssembly.hpp
@@ -151,8 +151,11 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureHDIV(Intrepid2
     // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
     fstIntegrateCall->start();
     FunctionSpaceTools::HDIVtransformDIV(unorientedTransformedDivValues, jacobianDeterminant, basisDivValues);
+    // we want to exclude orientation application in the core integration timing -- this time gets reported as "Other"
+    fstIntegrateCall->stop();
     OrientationTools<DeviceType>::modifyBasisByOrientation(transformedDivValues, unorientedTransformedDivValues,
                                                            orientationsWorkset, basis.get());
+    fstIntegrateCall->start();
     transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
     FunctionSpaceTools::multiplyMeasure(transformedWeightedDivValues, cellMeasures, transformedDivValues);
     transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedDivValues: one flop for each.
@@ -161,10 +164,12 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureHDIV(Intrepid2
     
     FunctionSpaceTools::integrate(cellStiffnessSubview, transformedDivValues, transformedWeightedDivValues);
     ExecutionSpace().fence();
-    
     FunctionSpaceTools::HDIVtransformVALUE(unorientedTransformedBasisValues, jacobian, jacobianDeterminant, basisValues);
+    // we want to exclude orientation application in the core integration timing -- this time gets reported as "Other"
+    fstIntegrateCall->stop();
     OrientationTools<DeviceType>::modifyBasisByOrientation(transformedBasisValues, unorientedTransformedBasisValues,
                                                            orientationsWorkset, basis.get());
+    fstIntegrateCall->start();
     FunctionSpaceTools::multiplyMeasure(transformedWeightedBasisValues, cellMeasures, transformedBasisValues);
     bool sumInto = true; // add the (value,value) integral to the (div,div) that we've already integrated
     FunctionSpaceTools::integrate(cellStiffnessSubview, transformedBasisValues, transformedWeightedBasisValues, sumInto);
diff --git a/packages/intrepid2/assembly-examples/HVOLStandardAssembly.hpp b/packages/intrepid2/assembly-examples/HVOLStandardAssembly.hpp
index 723b8f236698..e4729ec5e538 100644
--- a/packages/intrepid2/assembly-examples/HVOLStandardAssembly.hpp
+++ b/packages/intrepid2/assembly-examples/HVOLStandardAssembly.hpp
@@ -139,8 +139,11 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureHVOL(Intrepid2
     auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
     
     FunctionSpaceTools::HVOLtransformVALUE(unorientedTransformedBasisValues, jacobianDeterminant, basisValues);
+    // we want to exclude orientation application in the core integration timing -- this time gets reported as "Other"
+    fstIntegrateCall->stop();
     OrientationTools<DeviceType>::modifyBasisByOrientation(transformedBasisValues, unorientedTransformedBasisValues,
                                                            orientationsWorkset, basis.get());
+    fstIntegrateCall->start();
     FunctionSpaceTools::multiplyMeasure(transformedWeightedBasisValues, cellMeasures, transformedBasisValues);
     bool sumInto = true; // add the (value,value) integral to the (curl,curl) that we've already integrated
     FunctionSpaceTools::integrate(cellStiffnessSubview, transformedBasisValues, transformedWeightedBasisValues, sumInto);
diff --git a/packages/intrepid2/assembly-examples/StandardAssembly.hpp b/packages/intrepid2/assembly-examples/StandardAssembly.hpp
index a689306dfbf6..610918e7298d 100644
--- a/packages/intrepid2/assembly-examples/StandardAssembly.hpp
+++ b/packages/intrepid2/assembly-examples/StandardAssembly.hpp
@@ -110,10 +110,10 @@ namespace {
 }
 
 //! General assembly for two arbitrary bases and ops that uses the classic, generic Intrepid2 paths.
-template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType, unsigned long spaceDim2>  // spaceDim and spaceDim2 should agree on value (differ on type)
 Intrepid2::ScalarView<Scalar,DeviceType> performStandardAssembly(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, int worksetSize,
-                                                                 const int &polyOrder1, const Intrepid2::EFunctionSpace &fs1, const Intrepid2::EOperator &op1,
-                                                                 const int &polyOrder2, const Intrepid2::EFunctionSpace &fs2, const Intrepid2::EOperator &op2,
+                                                                 const int &polyOrder1, const Intrepid2::EFunctionSpace &fs1, const Intrepid2::EOperator &op1, Teuchos::RCP< Kokkos::Array<PointScalar,spaceDim2> > vectorWeight1,
+                                                                 const int &polyOrder2, const Intrepid2::EFunctionSpace &fs2, const Intrepid2::EOperator &op2, Teuchos::RCP< Kokkos::Array<PointScalar,spaceDim2> > vectorWeight2,
                                                                  double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
 {
   using ExecutionSpace = typename DeviceType::execution_space;
@@ -170,32 +170,72 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardAssembly(Intrepid2::Cell
   ViewType basis1Values = basis1->allocateOutputView(numPoints, op1); // (F1,P[,D])
   ViewType basis2Values = basis2->allocateOutputView(numPoints, op2); // (F2,P[,D])
   
-  ViewType orientedValues1, transformedValues1;
-  ViewType orientedValues2, transformedValues2, transformedWeightedValues2;
+  ViewType orientedValues1, transformedValues1, ultimateValues1;
+  ViewType orientedValues2, transformedValues2, ultimateValues2, ultimateWeightedValues2;
   
-  INTREPID2_TEST_FOR_EXCEPTION(basis1Values.rank() != basis2Values.rank(), std::invalid_argument, "basis1 and basis2 must agree on their rank under the respective operators");
+  // Rank of each basis' values after transformation to physical space (adds a leading cell dimension) and optional vector weighting:
+  //   un-transformed (F,P)   (rank 2, scalar values): unweighted -> (C,F,P),   weighted -> (C,F,P,D) (weights increase the rank)
+  //   un-transformed (F,P,D) (rank 3, vector values): unweighted -> (C,F,P,D), weighted -> (C,F,P)   (weights decrease the rank; we interpret as a dot product)
+  // Any other rank is unsupported: reject it up front, so that the ranks determined below are never read uninitialized.
+  INTREPID2_TEST_FOR_EXCEPTION((basis1Values.rank() != 2) && (basis1Values.rank() != 3), std::invalid_argument, "basis1 values must have rank 2 or 3 under op1");
+  INTREPID2_TEST_FOR_EXCEPTION((basis2Values.rank() != 2) && (basis2Values.rank() != 3), std::invalid_argument, "basis2 values must have rank 2 or 3 under op2");
+  
+  int ultimateBasis1Rank = -1, ultimateBasis2Rank = -1; // -1: not yet determined
+  if (basis1Values.rank() == 2)
+  {
+    ultimateBasis1Rank = (vectorWeight1 == Teuchos::null) ? 3 : 4; // (C,F,P) or (C,F,P,D)
+  }
+  else // basis1Values.rank() == 3, guaranteed by the check above
+  {
+    ultimateBasis1Rank = (vectorWeight1 == Teuchos::null) ? 4 : 3; // (C,F,P,D) or (C,F,P)
+  }
+  // basis 2 follows the same rules as basis 1
+  if (basis2Values.rank() == 2)
+  {
+    ultimateBasis2Rank = (vectorWeight2 == Teuchos::null) ? 3 : 4; // (C,F,P) or (C,F,P,D)
+  }
+  else // basis2Values.rank() == 3, guaranteed by the check above
+  {
+    ultimateBasis2Rank = (vectorWeight2 == Teuchos::null) ? 4 : 3; // (C,F,P,D) or (C,F,P)
+  }
   
-  const bool scalarValued = (basis1Values.rank() == 2); // (F1,P): scalar-valued
-  if (scalarValued)
+  INTREPID2_TEST_FOR_EXCEPTION(ultimateBasis1Rank != ultimateBasis2Rank, std::invalid_argument, "basis1 and basis2 must agree on their rank under the respective operators");
+  
+  if (basis1Values.rank() == 2)
   {
     orientedValues1 = ViewType("oriented values 1", worksetSize, numFields1, numPoints);
-    orientedValues2 = ViewType("oriented values 2", worksetSize, numFields2, numPoints);
-    
     transformedValues1 = ViewType("transformed values 1", worksetSize, numFields1, numPoints);
+  }
+  else
+  {
+    orientedValues1 = ViewType("oriented values 1", worksetSize, numFields1, numPoints, spaceDim);
+    transformedValues1 = ViewType("transformed values 1", worksetSize, numFields1, numPoints, spaceDim);
+  }
+  if (basis2Values.rank() == 2)
+  {
+    orientedValues2 = ViewType("oriented values 2", worksetSize, numFields2, numPoints);
     transformedValues2 = ViewType("transformed values 2", worksetSize, numFields2, numPoints);
+  }
+  else
+  {
+    orientedValues2 = ViewType("oriented values 2", worksetSize, numFields2, numPoints, spaceDim);
+    transformedValues2 = ViewType("transformed values 2", worksetSize, numFields2, numPoints, spaceDim);
+  }
+  
+  const bool scalarValued = (ultimateBasis1Rank == 3); // (C,F1,P): scalar-valued
+  if (scalarValued)
+  {
+    ultimateValues1 = ViewType("ultimate values 1", worksetSize, numFields1, numPoints);
+    ultimateValues2 = ViewType("ultimate values 2", worksetSize, numFields2, numPoints);
     
-    transformedWeightedValues2 = ViewType("transformed weighted values 2", worksetSize, numFields2, numPoints);
+    ultimateWeightedValues2 = ViewType("ultimate weighted values 2", worksetSize, numFields2, numPoints);
   }
   else // (F1, P, D)
   {
-    const int finalDim = basis1Values.extent_int(2);
-    orientedValues1 = ViewType("oriented values 1", worksetSize, numFields1, numPoints, finalDim);
-    orientedValues2 = ViewType("oriented values 2", worksetSize, numFields2, numPoints, finalDim);
-    
-    transformedValues1 = ViewType("transformed values 1", worksetSize, numFields1, numPoints, finalDim);
-    transformedValues2 = ViewType("transformed values 2", worksetSize, numFields2, numPoints, finalDim);
+    ultimateValues1 = ViewType("ultimate values 1", worksetSize, numFields1, numPoints, spaceDim);
+    ultimateValues2 = ViewType("ultimate values 2", worksetSize, numFields2, numPoints, spaceDim);
     
-    transformedWeightedValues2 = ViewType("transformed weighted values 2", worksetSize, numFields2, numPoints, finalDim);
+    ultimateWeightedValues2 = ViewType("ultimate weighted values 2", worksetSize, numFields2, numPoints, spaceDim);
   }
     
   basis1->getValues(basis1Values, cubaturePoints, op1 );
@@ -218,6 +258,10 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardAssembly(Intrepid2::Cell
   ViewType jacobianDeterminant("jacobian determinant", worksetSize, numPoints);
   ViewType jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim);
   ViewType jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim);
+  
+  // Views used for vector-weighted case:
+  ViewType scalarTransformedValues1        ("scalar transformed values 1", worksetSize, numFields1, numPoints);
+  ViewType scalarTransformedWeightedValues2("scalar transformed weighted values 2", worksetSize, numFields2, numPoints);
 
   initialSetupTimer->stop();
   
@@ -243,23 +287,45 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardAssembly(Intrepid2::Cell
       Kokkos::resize(jacobianInverse,     numCellsInWorkset, numPoints, spaceDim, spaceDim);
       Kokkos::resize(jacobianDeterminant, numCellsInWorkset, numPoints);
       Kokkos::resize(cellMeasures,        numCellsInWorkset, numPoints);
+      Kokkos::resize(jacobianDeterminant, numCellsInWorkset, numPoints);
       
-      if (scalarValued)
+      Kokkos::resize(scalarTransformedValues1,         numCellsInWorkset, numFields1, numPoints);
+      Kokkos::resize(scalarTransformedWeightedValues2, numCellsInWorkset, numFields2, numPoints);
+      
+      if (basis1Values.rank() == 2)
+      {
+        Kokkos::resize(orientedValues1,    numCellsInWorkset, numFields1, numPoints);
+        Kokkos::resize(transformedValues1, numCellsInWorkset, numFields1, numPoints);
+      }
+      else
+      {
+        Kokkos::resize(orientedValues1,    numCellsInWorkset, numFields1, numPoints, spaceDim);
+        Kokkos::resize(transformedValues1, numCellsInWorkset, numFields1, numPoints, spaceDim);
+      }
+      if (basis2Values.rank() == 2)
       {
-        Kokkos::resize(orientedValues1,            numCellsInWorkset, numFields1, numPoints);
-        Kokkos::resize(orientedValues2,            numCellsInWorkset, numFields2, numPoints);
-        Kokkos::resize(transformedValues1,         numCellsInWorkset, numFields1, numPoints);
-        Kokkos::resize(transformedValues2,         numCellsInWorkset, numFields2, numPoints);
-        Kokkos::resize(transformedWeightedValues2, numCellsInWorkset, numFields2, numPoints);
+        Kokkos::resize(orientedValues2,    numCellsInWorkset, numFields2, numPoints);
+        Kokkos::resize(transformedValues2, numCellsInWorkset, numFields2, numPoints);
       }
       else
       {
-        const int finalDim = basis1Values.extent_int(2);
-        Kokkos::resize(orientedValues1,            numCellsInWorkset, numFields1, numPoints, finalDim);
-        Kokkos::resize(orientedValues2,            numCellsInWorkset, numFields2, numPoints, finalDim);
-        Kokkos::resize(transformedValues1,         numCellsInWorkset, numFields1, numPoints, finalDim);
-        Kokkos::resize(transformedValues2,         numCellsInWorkset, numFields2, numPoints, finalDim);
-        Kokkos::resize(transformedWeightedValues2, numCellsInWorkset, numFields2, numPoints, finalDim);
+        Kokkos::resize(orientedValues2,    numCellsInWorkset, numFields2, numPoints, spaceDim);
+        Kokkos::resize(transformedValues2, numCellsInWorkset, numFields2, numPoints, spaceDim);
+      }
+      
+      // the ultimate (post-vector-weighting) containers must shrink to this final, smaller workset, like the containers resized above
+      if (scalarValued)
+      {
+        Kokkos::resize(ultimateValues1,         numCellsInWorkset, numFields1, numPoints);
+        Kokkos::resize(ultimateValues2,         numCellsInWorkset, numFields2, numPoints);
+        Kokkos::resize(ultimateWeightedValues2, numCellsInWorkset, numFields2, numPoints);
+      }
+      else // vector-valued: (C,F,P,D)
+      {
+        // size by numCellsInWorkset (not worksetSize), so the cell extent agrees with the other per-workset containers (e.g. cellMeasures)
+        Kokkos::resize(ultimateValues1,         numCellsInWorkset, numFields1, numPoints, spaceDim);
+        Kokkos::resize(ultimateValues2,         numCellsInWorkset, numFields2, numPoints, spaceDim);
+        Kokkos::resize(ultimateWeightedValues2, numCellsInWorkset, numFields2, numPoints, spaceDim);
+      }
     }
     jacobianAndCellMeasureTimer->start();
@@ -271,20 +337,94 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardAssembly(Intrepid2::Cell
     ExecutionSpace().fence();
     jacobianAndCellMeasureTimer->stop();
     
-    // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
-    fstIntegrateCall->start();
     OrientationTools<DeviceType>::modifyBasisByOrientation(orientedValues1, basis1Values, orientationsWorkset, basis1.get());
     OrientationTools<DeviceType>::modifyBasisByOrientation(orientedValues2, basis2Values, orientationsWorkset, basis2.get());
+    
+    // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
+    fstIntegrateCall->start();
     transform(transformedValues1, orientedValues1, fs1, op1, jacobian, jacobianDeterminant, jacobianInverse);
     transform(transformedValues2, orientedValues2, fs2, op2, jacobian, jacobianDeterminant, jacobianInverse);
-    
-    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields1+numFields2) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
-    FunctionSpaceTools::multiplyMeasure(transformedWeightedValues2, cellMeasures, transformedValues2);
-    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields1+numFields2) * double(numPoints) * double(spaceDim); // multiply each entry of transformedGradValues: one flop for each.
         
     auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
     
-    FunctionSpaceTools::integrate(cellStiffnessSubview, transformedValues1, transformedWeightedValues2);
+    if (vectorWeight1 != Teuchos::null)
+    {
+      auto uWeight = *vectorWeight1;
+      
+      auto policy3 = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{numCellsInWorkset,numFields1,numPoints});
+      if (transformedValues1.rank() == 4)
+      {
+        Kokkos::parallel_for("compute ultimateValues1", policy3,
+        KOKKOS_LAMBDA (const int &cellOrdinal, const int &fieldOrdinal, const int &pointOrdinal)
+        {
+          Scalar u_result = 0;
+          for (int d=0; d<spaceDim; d++)
+          {
+            u_result += uWeight[d] * transformedValues1(cellOrdinal,fieldOrdinal,pointOrdinal,d);
+          }
+          ultimateValues1(cellOrdinal,fieldOrdinal,pointOrdinal) = u_result;
+        });
+      }
+      else // transformedValues1.rank() == 3
+      {
+        Kokkos::parallel_for("compute ultimateValues1", policy3,
+        KOKKOS_LAMBDA (const int &cellOrdinal, const int &fieldOrdinal, const int &pointOrdinal)
+        {
+          const Scalar & value1 = transformedValues1(cellOrdinal,fieldOrdinal,pointOrdinal);
+          for (int d=0; d<spaceDim; d++)
+          {
+            ultimateValues1(cellOrdinal,fieldOrdinal,pointOrdinal, d) = value1 * uWeight[d];
+          }
+        });
+      }
+    }
+    else
+    {
+      ultimateValues1 = transformedValues1;
+    }
+    
+    if (vectorWeight2 != Teuchos::null)
+    {
+      auto vWeight = *vectorWeight2;
+      
+      auto policy3 = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{numCellsInWorkset,numFields2,numPoints});
+      if (transformedValues2.rank() == 4)
+      {
+        Kokkos::parallel_for("compute ultimateValues2", policy3,
+        KOKKOS_LAMBDA (const int &cellOrdinal, const int &fieldOrdinal, const int &pointOrdinal)
+        {
+          Scalar v_result = 0;
+          for (int d=0; d<spaceDim; d++)
+          {
+            v_result += vWeight[d] * transformedValues2(cellOrdinal,fieldOrdinal,pointOrdinal,d);
+          }
+          ultimateValues2(cellOrdinal,fieldOrdinal,pointOrdinal) = v_result;
+        });
+      }
+      else // transformedValues2.rank() == 3
+      {
+        Kokkos::parallel_for("compute ultimateValues2", policy3,
+        KOKKOS_LAMBDA (const int &cellOrdinal, const int &fieldOrdinal, const int &pointOrdinal)
+        {
+          const Scalar & value2 = transformedValues2(cellOrdinal,fieldOrdinal,pointOrdinal);
+          for (int d=0; d<spaceDim; d++)
+          {
+            ultimateValues2(cellOrdinal,fieldOrdinal,pointOrdinal, d) = value2 * vWeight[d];
+          }
+        });
+      }
+    }
+    else
+    {
+      ultimateValues2 = transformedValues2;
+    }
+    
+    FunctionSpaceTools::multiplyMeasure(ultimateWeightedValues2, cellMeasures, ultimateValues2);
+    transformIntegrateFlopCount += ultimateValues2.size(); // multiply each entry of ultimateValues2: one flop for each.
+    
+    FunctionSpaceTools::integrate(cellStiffnessSubview, ultimateValues1, ultimateWeightedValues2);
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields1+numFields2) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
+    
     ExecutionSpace().fence();
     fstIntegrateCall->stop();
     
@@ -297,4 +437,18 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardAssembly(Intrepid2::Cell
   return cellStiffness;
 }
 
+//! Convenience overload of performStandardAssembly() for the unweighted case; forwards to the vector-weighted overload with null weights.
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStandardAssembly(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, int worksetSize,
+                                                                 const int &polyOrder1, const Intrepid2::EFunctionSpace &fs1, const Intrepid2::EOperator &op1,
+                                                                 const int &polyOrder2, const Intrepid2::EFunctionSpace &fs2, const Intrepid2::EOperator &op2,
+                                                                 double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  // a null RCP for each weight selects the unweighted path; its Kokkos::Array extent is what spaceDim2 gets deduced from
+  using VectorWeight = Teuchos::RCP< Kokkos::Array<PointScalar,spaceDim> >;
+  const VectorWeight noWeight = Teuchos::null;
+  return performStandardAssembly<Scalar,BasisFamily,PointScalar,spaceDim,DeviceType>(geometry, worksetSize, polyOrder1, fs1, op1, noWeight,
+                                                                                     polyOrder2, fs2, op2, noWeight,
+                                                                                     transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+}
 #endif /* StandardAssembly_hpp */
diff --git a/packages/intrepid2/assembly-examples/StructuredAssembly.hpp b/packages/intrepid2/assembly-examples/StructuredAssembly.hpp
index 24c87de7e90e..98a31da9c041 100644
--- a/packages/intrepid2/assembly-examples/StructuredAssembly.hpp
+++ b/packages/intrepid2/assembly-examples/StructuredAssembly.hpp
@@ -102,10 +102,10 @@ namespace {
 }
 
 //! General assembly for two arbitrary bases and ops that takes advantage of the new structured integration support, including support for sum factorization.
-template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType, unsigned long spaceDim2>  // spaceDim and spaceDim2 should agree in value (differ in type)
 Intrepid2::ScalarView<Scalar,DeviceType> performStructuredAssembly(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &worksetSize,
-                                                                   const int &polyOrder1, const Intrepid2::EFunctionSpace &fs1, const Intrepid2::EOperator &op1,
-                                                                   const int &polyOrder2, const Intrepid2::EFunctionSpace &fs2, const Intrepid2::EOperator &op2,
+                                                                   const int &polyOrder1, const Intrepid2::EFunctionSpace &fs1, const Intrepid2::EOperator &op1, Teuchos::RCP< Kokkos::Array<PointScalar,spaceDim2> > vectorWeight1,
+                                                                   const int &polyOrder2, const Intrepid2::EFunctionSpace &fs2, const Intrepid2::EOperator &op2, Teuchos::RCP< Kokkos::Array<PointScalar,spaceDim2> > vectorWeight2,
                                                                    double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
 {
   using namespace Intrepid2;
@@ -151,7 +151,7 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStructuredAssembly(Intrepid2::Ce
   
   BasisValues<Scalar,DeviceType> basis2Values = basis2->allocateBasisValues(tensorCubaturePoints, op2);
   basis2->getValues(basis2Values, tensorCubaturePoints, op2);
-      
+  
   int cellOffset = 0;
   
   auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
@@ -169,18 +169,19 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStructuredAssembly(Intrepid2::Ce
   auto transformedBasis2ValuesTemp = transform(basis2Values, fs2, op2, jacobian, jacobianDet, jacobianInv, jacobianDetInv, jacobianDividedByJacobianDet);
   auto integralData = IntegrationTools::allocateIntegralData(transformedBasis1ValuesTemp, cellMeasures, transformedBasis2ValuesTemp);
   
-  const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases
+  const int numJacobianDataPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases
+  const int numPoints             = jacobian.extent_int(1); // number of logical points
   
   // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount)
-  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
-  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
-  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numJacobianDataPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numJacobianDataPoints);
+  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numJacobianDataPoints);
   
   transformIntegrateFlopCount = 0;
   jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
   jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
   jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
-  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  jacobianCellMeasureFlopCount += numCells * numJacobianDataPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
   
   auto refData = geometry.getJacobianRefData(tensorCubaturePoints);
   
@@ -217,6 +218,49 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStructuredAssembly(Intrepid2::Ce
     auto transformedBasis1Values = transform(basis1Values, fs1, op1, jacobian, jacobianDet, jacobianInv, jacobianDetInv, jacobianDividedByJacobianDet);
     auto transformedBasis2Values = transform(basis2Values, fs2, op2, jacobian, jacobianDet, jacobianInv, jacobianDetInv, jacobianDividedByJacobianDet);
     
+    if (vectorWeight1 != Teuchos::null)
+    {
+      // copy the weight vector for basis 1 into a device view, staging it through a host mirror
+      ScalarView<Scalar,DeviceType> auView("a_u", spaceDim);
+      auto auViewHost = Kokkos::create_mirror(auView);
+      for (int d=0; d<spaceDim; d++)
+      {
+        auViewHost(d) = (*vectorWeight1)[d];
+      }
+      Kokkos::deep_copy(auView, auViewHost);
+      // wrap the weight as logical (C,P,D) data that is constant in cell and point, so only spaceDim entries are stored
+      Kokkos::Array<int,3> extents {numCellsInWorkset,numPoints,spaceDim};
+      Kokkos::Array<DataVariationType,3> variationTypes {CONSTANT, CONSTANT, GENERAL};
+      // fold the weight into the basis transform; use Scalar (not a hard-coded double) so the type stays consistent with uTransform and basis1Values
+      Data<Scalar,DeviceType> au_data(auView, extents, variationTypes);
+      auto uTransform = Data<Scalar,DeviceType>::allocateMatVecResult(transformedBasis1Values.transform(), au_data, true);
+      uTransform.storeMatVec(transformedBasis1Values.transform(), au_data, true); // true: transpose basis transform when multiplying
+      transformedBasis1Values = Intrepid2::TransformedBasisValues<Scalar, DeviceType>(uTransform, basis1Values);
+      
+      // TODO: modify transformIntegrateFlopCount to include an estimate for above mat-vecs (but note that these will not be a dominant cost, especially at high order).
+    }
+    
+    if (vectorWeight2 != Teuchos::null)
+    {
+      // copy the weight vector for basis 2 into a device view, staging it through a host mirror
+      ScalarView<Scalar,DeviceType> avView("a_v", spaceDim);
+      auto avViewHost = Kokkos::create_mirror(avView);
+      
+      for (int d=0; d<spaceDim; d++)
+      {
+        avViewHost(d) = (*vectorWeight2)[d];
+      }
+      Kokkos::deep_copy(avView, avViewHost);
+      // wrap the weight as logical (C,P,D) data that is constant in cell and point, so only spaceDim entries are stored
+      Kokkos::Array<int,3> extents {numCellsInWorkset,numPoints,spaceDim};
+      Kokkos::Array<DataVariationType,3> variationTypes {CONSTANT, CONSTANT, GENERAL};
+      // fold the weight into the basis transform; use Scalar (not a hard-coded double) so the type stays consistent with vTransform and basis2Values
+      Data<Scalar,DeviceType> av_data(avView, extents, variationTypes);
+      auto vTransform = Data<Scalar,DeviceType>::allocateMatVecResult(transformedBasis2Values.transform(), av_data, true);
+      vTransform.storeMatVec(transformedBasis2Values.transform(), av_data, true); // true: transpose basis transform when multiplying
+      transformedBasis2Values = Intrepid2::TransformedBasisValues<Scalar, DeviceType>(vTransform, basis2Values);
+      
+      // TODO: modify transformIntegrateFlopCount to include an estimate for above mat-vecs (but note that these will not be a dominant cost, especially at high order).
+    }
+    
     geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights);
     ExecutionSpace().fence();
     jacobianAndCellMeasureTimer->stop();
@@ -243,6 +287,22 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStructuredAssembly(Intrepid2::Ce
     cellOffset += worksetSize;
   }
   return cellStiffness;
+
+}
+
+//! Convenience overload of performStructuredAssembly() for the unweighted case; forwards to the vector-weighted overload with null weights.
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStructuredAssembly(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &worksetSize,
+                                                                   const int &polyOrder1, const Intrepid2::EFunctionSpace &fs1, const Intrepid2::EOperator &op1,
+                                                                   const int &polyOrder2, const Intrepid2::EFunctionSpace &fs2, const Intrepid2::EOperator &op2,
+                                                                   double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  // a null RCP for each weight selects the unweighted path; its Kokkos::Array extent is what spaceDim2 gets deduced from
+  using VectorWeight = Teuchos::RCP< Kokkos::Array<PointScalar,spaceDim> >;
+  const VectorWeight noWeight = Teuchos::null;
+  return performStructuredAssembly<Scalar,BasisFamily,PointScalar,spaceDim,DeviceType>(geometry, worksetSize, polyOrder1, fs1, op1, noWeight,
+                                                                                       polyOrder2, fs2, op2, noWeight,
+                                                                                       transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
 }
 
 #endif /* StructuredAssembly_h */
diff --git a/packages/intrepid2/assembly-examples/VectorWeightedGRADGRADStandardAssembly.hpp b/packages/intrepid2/assembly-examples/VectorWeightedGRADGRADStandardAssembly.hpp
new file mode 100644
index 000000000000..dc540e7e65a3
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/VectorWeightedGRADGRADStandardAssembly.hpp
@@ -0,0 +1,205 @@
+//
+//  VectorWeightedGRADGRADStandardAssembly.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 5/13/24.
+//
+
+#ifndef Intrepid2_VectorWeightedGRADGRADStandardAssembly_hpp
+#define Intrepid2_VectorWeightedGRADGRADStandardAssembly_hpp
+
+#include "JacobianFlopEstimate.hpp"
+#include "Intrepid2_OrientationTools.hpp"
+
+/** \file   VectorWeightedGRADGRADStandardAssembly.hpp
+    \brief  Locally assembles a vector-weighted Poisson matrix -- an array of shape (C,F,F), with formulation (a dot grad e_i, b dot grad e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure.
+ */
+
+//! Version that uses the classic, generic Intrepid2 paths.
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType, unsigned long spaceDim2 = spaceDim>
+Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureVectorWeightedGRADGRAD(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry,
+                                                                                         const int &polyOrder, int worksetSize,
+                                                                                         Teuchos::RCP<Kokkos::Array<Scalar,spaceDim2>> vectorWeight1,
+                                                                                         Teuchos::RCP<Kokkos::Array<Scalar,spaceDim2>> vectorWeight2,
+                                                                                         double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  INTREPID2_TEST_FOR_EXCEPTION(vectorWeight1 == Teuchos::null, std::invalid_argument, "vectorWeight1 cannot be null");
+  INTREPID2_TEST_FOR_EXCEPTION(vectorWeight2 == Teuchos::null, std::invalid_argument, "vectorWeight2 cannot be null");
+  
+  using ExecutionSpace = typename DeviceType::execution_space;
+  int numVertices = 1;
+  for (int d=0; d<spaceDim; d++)
+  {
+    numVertices *= 2;
+  }
+  
+  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+  initialSetupTimer->start();
+  
+  using CellTools = Intrepid2::CellTools<DeviceType>;
+  using FunctionSpaceTools = Intrepid2::FunctionSpaceTools<DeviceType>;
+  
+  using namespace Intrepid2;
+  
+  using namespace std;
+  // dimensions of the returned view are (C,F,F)
+  auto fs = FUNCTION_SPACE_HGRAD;
+
+  Intrepid2::ScalarView<Intrepid2::Orientation,DeviceType> orientations("orientations", geometry.numCells() );
+  geometry.orientations(orientations, 0, -1);
+  
+  shards::CellTopology cellTopo = geometry.cellTopology();
+  
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  
+  int numFields = basis->getCardinality();
+  int numCells = geometry.numCells();
+  
+  if (worksetSize > numCells) worksetSize = numCells;
+  
+  // local stiffness matrices:
+  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
+  
+  auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+  int numPoints = cubature->getNumPoints();
+  ScalarView<PointScalar,DeviceType> cubaturePoints("cubature points",numPoints,spaceDim);
+  ScalarView<double,DeviceType> cubatureWeights("cubature weights", numPoints);
+  
+  cubature->getCubature(cubaturePoints, cubatureWeights);
+  
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
+  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);
+  
+  // Allocate some intermediate containers
+  ScalarView<Scalar,DeviceType> basisValues    ("basis values", numFields, numPoints );
+  ScalarView<Scalar,DeviceType> basisGradValues("basis grad values", numFields, numPoints, spaceDim);
+
+  ScalarView<Scalar,DeviceType> unorientedTransformedGradValues("unoriented transformed grad values", worksetSize, numFields, numPoints, spaceDim);
+  ScalarView<Scalar,DeviceType> transformedGradValues("transformed grad values", worksetSize, numFields, numPoints, spaceDim);
+  ScalarView<Scalar,DeviceType> transformedWeightedGradValues("transformed weighted grad values", worksetSize, numFields, numPoints, spaceDim);
+  ScalarView<Scalar,DeviceType> vectorWeightedTransformedGradValues("vector-weighted transformed grad values", worksetSize, numFields, numPoints);
+  ScalarView<Scalar,DeviceType> vectorWeightedTransformedWeightedGradValues("vector-weighted transformed weighted grad values", worksetSize, numFields, numPoints);
+  
+  basis->getValues(basisValues,     cubaturePoints, OPERATOR_VALUE );
+  basis->getValues(basisGradValues, cubaturePoints, OPERATOR_GRAD  );
+  
+  const int numNodesPerCell = geometry.numNodesPerCell();
+  ScalarView<PointScalar,DeviceType> expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim);
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0,numCells),
+  KOKKOS_LAMBDA (const int &cellOrdinal) {
+    for (int nodeOrdinal=0; nodeOrdinal<numNodesPerCell; nodeOrdinal++)
+    {
+      for (int d=0; d<spaceDim; d++)
+      {
+        expandedCellNodes(cellOrdinal,nodeOrdinal,d) = geometry(cellOrdinal,nodeOrdinal,d);
+      }
+    }
+  });
+  
+  ScalarView<Scalar,DeviceType> cellMeasures("cell measures", worksetSize, numPoints);
+  ScalarView<Scalar,DeviceType> jacobianDeterminant("jacobian determinant", worksetSize, numPoints);
+  ScalarView<Scalar,DeviceType> jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim);
+  ScalarView<Scalar,DeviceType> jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim);
+
+  auto auView = getView<Scalar,DeviceType>("a_u", spaceDim);
+  auto auViewHost = Kokkos::create_mirror(auView);
+
+  for (int d=0; d<spaceDim; d++)
+  {
+    auViewHost(d) = (*vectorWeight1)[d];
+  }
+  Kokkos::deep_copy(auView, auViewHost);
+  
+  auto avView = getView<Scalar,DeviceType>("a_v", spaceDim);
+  auto avViewHost = Kokkos::create_mirror(avView);
+  for (int d=0; d<spaceDim; d++)
+  {
+    avViewHost(d) = (*vectorWeight2)[d];
+  }
+  Kokkos::deep_copy(avView, avViewHost);
+  
+  initialSetupTimer->stop();
+  
+  transformIntegrateFlopCount  = 0;
+  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
+  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  
+  int cellOffset = 0;
+  while (cellOffset < numCells)
+  {
+    int startCell         = cellOffset;
+    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
+    
+    std::pair<int,int> cellRange = {startCell, startCell+numCellsInWorkset};
+    auto cellWorkset         = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    auto orientationsWorkset = Kokkos::subview(orientations, cellRange);
+    
+    if (numCellsInWorkset != worksetSize)
+    {
+      Kokkos::resize(jacobian,                        numCellsInWorkset, numPoints, spaceDim, spaceDim);
+      Kokkos::resize(jacobianInverse,                 numCellsInWorkset, numPoints, spaceDim, spaceDim);
+      Kokkos::resize(jacobianDeterminant,             numCellsInWorkset, numPoints);
+      Kokkos::resize(cellMeasures,                    numCellsInWorkset, numPoints);
+      Kokkos::resize(unorientedTransformedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim);
+      Kokkos::resize(transformedGradValues,           numCellsInWorkset, numFields, numPoints, spaceDim);
+      Kokkos::resize(transformedWeightedGradValues,   numCellsInWorkset, numFields, numPoints, spaceDim);
+    }
+    jacobianAndCellMeasureTimer->start();
+    CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell.
+    CellTools::setJacobianInv(jacobianInverse, jacobian);
+    CellTools::setJacobianDet(jacobianDeterminant, jacobian);
+    
+    FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights);
+    ExecutionSpace().fence();
+    jacobianAndCellMeasureTimer->stop();
+    
+    // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
+    fstIntegrateCall->start();
+    FunctionSpaceTools::HGRADtransformGRAD(unorientedTransformedGradValues, jacobianInverse, basisGradValues);
+        // we want to exclude orientation application in the core integration timing -- this time gets reported as "Other"
+    fstIntegrateCall->stop();
+    OrientationTools<DeviceType>::modifyBasisByOrientation(transformedGradValues, unorientedTransformedGradValues,
+                                                           orientationsWorkset, basis.get());
+    fstIntegrateCall->start();
+    
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
+    FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues);
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedGradValues: one flop for each.
+        
+    auto policy3 = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{numCellsInWorkset,numFields,numPoints});
+    Kokkos::parallel_for("compute expanded_{u,v}TransformedGradValues", policy3,
+    KOKKOS_LAMBDA (const int &cellOrdinal, const int &fieldOrdinal, const int &pointOrdinal)
+    {
+      Scalar u_result = 0;
+      Scalar v_result_weighted = 0;
+      for (int d=0; d<spaceDim; d++)
+      {
+        u_result          += auView(d) *         transformedGradValues(cellOrdinal,fieldOrdinal,pointOrdinal,d);
+        v_result_weighted += avView(d) * transformedWeightedGradValues(cellOrdinal,fieldOrdinal,pointOrdinal,d);
+      }
+      vectorWeightedTransformedGradValues(cellOrdinal,fieldOrdinal,pointOrdinal) = u_result;
+      vectorWeightedTransformedWeightedGradValues(cellOrdinal,fieldOrdinal,pointOrdinal) = v_result_weighted;
+    });
+    
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim * 2 * 2); // 2 * 2: one multiply, one add per (D) entry, times 2 containers u and v
+    
+    auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    
+    FunctionSpaceTools::integrate(cellStiffnessSubview, vectorWeightedTransformedGradValues, vectorWeightedTransformedWeightedGradValues);
+    ExecutionSpace().fence();
+    fstIntegrateCall->stop();
+    
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * 2); // 2: one multiply, one add per P entry in the contraction.
+    
+    cellOffset += worksetSize;
+  }
+//  std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl;
+  return cellStiffness;
+}
+
+#endif /* VectorWeightedGRADGRADStandardAssembly_h */
diff --git a/packages/intrepid2/assembly-examples/VectorWeightedGRADGRADStructuredAssembly.hpp b/packages/intrepid2/assembly-examples/VectorWeightedGRADGRADStructuredAssembly.hpp
new file mode 100644
index 000000000000..a1d640607720
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/VectorWeightedGRADGRADStructuredAssembly.hpp
@@ -0,0 +1,187 @@
+//
+//  VectorWeightedGRADGRADStructuredAssembly.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 5/13/24.
+//
+
+#ifndef VectorWeightedGRADGRADStructuredAssembly_h
+#define VectorWeightedGRADGRADStructuredAssembly_h
+
+#include "JacobianFlopEstimate.hpp"
+#include "Intrepid2_OrientationTools.hpp"
+
+/** \file   VectorWeightedGRADGRADStructuredAssembly.hpp
+    \brief  Locally assembles a vector-weighted Poisson matrix -- an array of shape (C,F,F), with formulation (a dot grad e_i, b dot grad e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry.
+ */
+
+/** \brief  Version that takes advantage of new structured integration support, including sum factorization.
+    \param  geometry                     [in]  - cell geometry; supplies the cell count, cell topology, orientations, Jacobians, and cell measures
+    \param  polyOrder                    [in]  - polynomial order requested from the H(grad) basis; the cubature is created with degree 2*polyOrder
+    \param  worksetSize                  [in]  - number of cells handled per pass through the workset loop (the final pass may be smaller)
+    \param  vectorWeight1                [in]  - entries [0,spaceDim) are copied to "a_u", used with the first basis-values argument of integrate()
+    \param  vectorWeight2                [in]  - entries [0,spaceDim) are copied to "a_v", used with the second basis-values argument of integrate()
+    \param  transformIntegrateFlopCount  [out] - reset to 0, then accumulates the approximate flop count reported by each integrate() call
+    \param  jacobianCellMeasureFlopCount [out] - estimated flops for the Jacobian, its inverse, its determinant, and the cell measure
+    \return view of shape (C,F,F) holding the per-cell matrices after modification by orientations
+ */
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType, unsigned long spaceDim2>
+Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadratureVectorWeightedGRADGRAD(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                                                                           Teuchos::RCP<Kokkos::Array<Scalar,spaceDim2>> vectorWeight1,
+                                                                                           Teuchos::RCP<Kokkos::Array<Scalar,spaceDim2>> vectorWeight2,
+                                                                             double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  using namespace Intrepid2;
+  using ExecutionSpace = typename DeviceType::execution_space;
+  // the copy loops below read entries [0,spaceDim) of each vector weight, so the arrays must be at least that long
+  static_assert(spaceDim2 >= static_cast<unsigned long>(spaceDim), "vector weights must have at least spaceDim entries");
+  
+  int numVertices = 1 << spaceDim; // 2^spaceDim (one doubling per spatial dimension)
+  
+  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+  initialSetupTimer->start();
+  using FunctionSpaceTools = FunctionSpaceTools<DeviceType>;
+  using IntegrationTools   = IntegrationTools<DeviceType>;
+  auto fs = FUNCTION_SPACE_HGRAD;
+  
+  Intrepid2::ScalarView<Intrepid2::Orientation,DeviceType> orientations("orientations", geometry.numCells() );
+  geometry.orientations(orientations, 0, -1);
+  
+  shards::CellTopology cellTopo = geometry.cellTopology();
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  int numFields = basis->getCardinality();
+  int numCells = geometry.numCells();
+    
+  // local stiffness matrix; dimensions of the returned view are (C,F,F):
+  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
+  ScalarView<Scalar,DeviceType> worksetCellStiffness("cell stiffness workset matrices",worksetSize,numFields,numFields);
+
+  auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+  auto tensorCubatureWeights = cubature->allocateCubatureWeights();
+  TensorPoints<PointScalar,DeviceType> tensorCubaturePoints  = cubature->allocateCubaturePoints();
+  cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights);
+  
+  EOperator op = OPERATOR_GRAD;
+  BasisValues<Scalar,DeviceType> gradientValues = basis->allocateBasisValues(tensorCubaturePoints, op);
+  basis->getValues(gradientValues, tensorCubaturePoints, op);
+  
+  // goal here is a vector-weighted Poisson-type integral; i.e. (a_u dot grad u, a_v dot grad v) on each cell
+  int cellOffset = 0;
+  
+  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+  
+  Data<PointScalar,DeviceType> jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize);
+  Data<PointScalar,DeviceType> jacobianDet = CellTools<DeviceType>::allocateJacobianDet(jacobian);
+  Data<PointScalar,DeviceType> jacobianInv = CellTools<DeviceType>::allocateJacobianInv(jacobian);
+  TensorData<PointScalar,DeviceType> cellMeasures = geometry.allocateCellMeasure(jacobianDet, tensorCubatureWeights);
+  
+  // lazily-evaluated transformed gradient values (temporary to allow integralData allocation)
+  auto transformedGradientValuesTemp = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues);
+  auto integralData = IntegrationTools::allocateIntegralData(transformedGradientValuesTemp, cellMeasures, transformedGradientValuesTemp);
+  
+  const int numJacobianDataPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases
+  const int numPoints             = jacobian.extent_int(1); // logical point count
+  
+  // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount)
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numJacobianDataPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numJacobianDataPoints);
+  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numJacobianDataPoints);
+  
+  transformIntegrateFlopCount = 0;
+  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
+  jacobianCellMeasureFlopCount += numCells * numJacobianDataPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  
+  auto refData = geometry.getJacobianRefData(tensorCubaturePoints);
+  
+  ScalarView<Scalar,DeviceType> auView("a_u", spaceDim);
+  auto auViewHost = Kokkos::create_mirror(auView);
+  
+  for (int d=0; d<spaceDim; d++)
+  {
+    auViewHost(d) = (*vectorWeight1)[d];
+  }
+  Kokkos::deep_copy(auView, auViewHost);
+  
+  ScalarView<Scalar,DeviceType> avView("a_v", spaceDim);
+  auto avViewHost = Kokkos::create_mirror(avView);
+  
+  for (int d=0; d<spaceDim; d++)
+  {
+    avViewHost(d) = (*vectorWeight2)[d];
+  }
+  Kokkos::deep_copy(avView, avViewHost);
+  Data<Scalar,DeviceType> au_data(auView, Kokkos::Array<int,3>{worksetSize,numPoints,spaceDim}, Kokkos::Array<DataVariationType,3>{CONSTANT,CONSTANT,GENERAL});
+  Data<Scalar,DeviceType> av_data(avView, Kokkos::Array<int,3>{worksetSize,numPoints,spaceDim}, Kokkos::Array<DataVariationType,3>{CONSTANT,CONSTANT,GENERAL});
+  
+  auto uTransform = Data<Scalar,DeviceType>::allocateMatVecResult(jacobianInv, au_data, true);
+  auto vTransform = Data<Scalar,DeviceType>::allocateMatVecResult(jacobianInv, av_data, true);
+  
+  initialSetupTimer->stop();
+  while (cellOffset < numCells)
+  {
+    int startCell         = cellOffset;
+    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
+    int endCell           = numCellsInWorkset + startCell;
+    
+    jacobianAndCellMeasureTimer->start();
+    if (numCellsInWorkset != worksetSize)
+    {
+      const int CELL_DIM = 0; // first dimension corresponds to cell
+      jacobian.setExtent    (CELL_DIM, numCellsInWorkset);
+      jacobianDet.setExtent (CELL_DIM, numCellsInWorkset);
+      jacobianInv.setExtent (CELL_DIM, numCellsInWorkset);
+      integralData.setExtent(CELL_DIM, numCellsInWorkset);
+      au_data.setExtent     (CELL_DIM, numCellsInWorkset);
+      av_data.setExtent     (CELL_DIM, numCellsInWorkset);
+      uTransform.setExtent  (CELL_DIM, numCellsInWorkset);
+      vTransform.setExtent  (CELL_DIM, numCellsInWorkset);
+      
+      Kokkos::resize(worksetCellStiffness, numCellsInWorkset, numFields, numFields);
+      
+      // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension…
+      cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset);
+    }
+    
+    geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell);
+    CellTools<DeviceType>::setJacobianDet(jacobianDet, jacobian);
+    CellTools<DeviceType>::setJacobianInv(jacobianInv, jacobian);
+    
+    // cell measures: computed from the Jacobian determinants and the tensor cubature weights
+    geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights);
+    ExecutionSpace().fence();
+    jacobianAndCellMeasureTimer->stop();
+    
+    uTransform.storeMatVec(jacobianInv, au_data, true); // true: transpose jacobianInv when multiplying
+    vTransform.storeMatVec(jacobianInv, av_data, true); // true: transpose jacobianInv when multiplying
+    // pair each transformed weight vector with the reference-space gradient values; Scalar (not double) to match the Data/BasisValues arguments
+    Intrepid2::TransformedBasisValues<Scalar, DeviceType> uTransformedGradientValues(uTransform, gradientValues);
+    Intrepid2::TransformedBasisValues<Scalar, DeviceType> vTransformedGradientValues(vTransform, gradientValues);
+    
+    bool sumInto = false;
+    double approximateFlopCountIntegrateWorkset = 0;
+    fstIntegrateCall->start();
+    IntegrationTools::integrate(integralData, uTransformedGradientValues, cellMeasures, vTransformedGradientValues, sumInto, &approximateFlopCountIntegrateWorkset);
+    ExecutionSpace().fence();
+    fstIntegrateCall->stop();
+    
+    // modify integrals by orientations
+    std::pair<int,int> cellRange = {startCell, endCell};
+    auto orientationsWorkset = Kokkos::subview(orientations, cellRange);
+    OrientationTools<DeviceType>::modifyMatrixByOrientation(worksetCellStiffness, integralData.getUnderlyingView(),
+                                                            orientationsWorkset, basis.get(), basis.get());
+    
+    // copy into cellStiffness container.
+    auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    Kokkos::deep_copy(cellStiffnessSubview, worksetCellStiffness);
+    
+    transformIntegrateFlopCount  += approximateFlopCountIntegrateWorkset;
+    
+    cellOffset += worksetSize;
+  }
+  return cellStiffness;
+}
+
+#endif /* VectorWeightedGRADGRADStructuredAssembly_h */
diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellData.hpp b/packages/intrepid2/src/Cell/Intrepid2_CellData.hpp
index 863bb0b18402..a4bcad3b089a 100644
--- a/packages/intrepid2/src/Cell/Intrepid2_CellData.hpp
+++ b/packages/intrepid2/src/Cell/Intrepid2_CellData.hpp
@@ -337,10 +337,10 @@ template<unsigned CellTopologyKey>
   */
   template<>
   struct PointInclusion<shards::Line<>::key> {
-    template<typename PointViewType>
+    template<typename PointViewType, typename ScalarType>
     KOKKOS_INLINE_FUNCTION
     static bool
-    check(const PointViewType &point, const double threshold);   
+    check(const PointViewType &point, const ScalarType threshold);   
   };
   
   /** 
@@ -348,10 +348,10 @@ template<unsigned CellTopologyKey>
   */
   template<>
   struct PointInclusion<shards::Triangle<>::key> {
-    template<typename PointViewType>
+    template<typename PointViewType, typename ScalarType>
     KOKKOS_INLINE_FUNCTION
     static bool
-    check(const PointViewType &point, const double threshold);
+    check(const PointViewType &point, const ScalarType threshold);
   };
   
   /** 
@@ -360,10 +360,10 @@ template<unsigned CellTopologyKey>
   template<>
   struct PointInclusion<shards::Quadrilateral<>::key> {
 
-    template<typename PointViewType>
+    template<typename PointViewType, typename ScalarType>
     KOKKOS_INLINE_FUNCTION
     static bool
-    check(const PointViewType &point, const double threshold);
+    check(const PointViewType &point, const ScalarType threshold);
   };
     
   /** 
@@ -371,10 +371,10 @@ template<unsigned CellTopologyKey>
   */
   template<>
   struct PointInclusion<shards::Tetrahedron<>::key> {
-    template<typename PointViewType>
+    template<typename PointViewType, typename ScalarType>
     KOKKOS_INLINE_FUNCTION
     static bool
-    check(const PointViewType &point, const double threshold);
+    check(const PointViewType &point, const ScalarType threshold);
   };
 
   /** 
@@ -382,10 +382,10 @@ template<unsigned CellTopologyKey>
   */
   template<>
   struct PointInclusion<shards::Hexahedron<>::key> {
-    template<typename PointViewType>
+    template<typename PointViewType, typename ScalarType>
     KOKKOS_INLINE_FUNCTION
     static bool
-    check(const PointViewType &point, const double threshold);
+    check(const PointViewType &point, const ScalarType threshold);
   };
   
   /** 
@@ -393,10 +393,10 @@ template<unsigned CellTopologyKey>
   */
   template<>
   struct PointInclusion<shards::Pyramid<>::key> {
-    template<typename PointViewType>
+    template<typename PointViewType, typename ScalarType>
     KOKKOS_INLINE_FUNCTION
     static bool
-    check(const PointViewType &point, const double threshold);
+    check(const PointViewType &point, const ScalarType threshold);
   };
 
   /** 
@@ -404,10 +404,10 @@ template<unsigned CellTopologyKey>
   */
   template<>
   struct PointInclusion<shards::Wedge<>::key> {
-    template<typename PointViewType>
+    template<typename PointViewType, typename ScalarType>
     KOKKOS_INLINE_FUNCTION
     static bool
-    check(const PointViewType &point, const double threshold);
+    check(const PointViewType &point, const ScalarType threshold);
   };
 
 }
diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellDataDef.hpp b/packages/intrepid2/src/Cell/Intrepid2_CellDataDef.hpp
index 1c7969c51655..6d9070dfda32 100644
--- a/packages/intrepid2/src/Cell/Intrepid2_CellDataDef.hpp
+++ b/packages/intrepid2/src/Cell/Intrepid2_CellDataDef.hpp
@@ -826,76 +826,76 @@ refCenterDataStatic_ = {
 // Point Inclusion 
 
 
-  template<typename PointViewType>
+  template<typename PointViewType, typename ScalarType>
   KOKKOS_INLINE_FUNCTION
   bool
   PointInclusion<shards::Line<>::key>::
-  check(const PointViewType &point, const double threshold) {
-    const double minus_one = -1.0 - threshold, plus_one = 1.0 + threshold;
+  check(const PointViewType &point, const ScalarType threshold) {
+    const ScalarType minus_one = -1.0 - threshold, plus_one = 1.0 + threshold;
     return (minus_one <= point(0) && point(0) <= plus_one);
   }  
 
-  template<typename PointViewType>
+  template<typename PointViewType, typename ScalarType>
   KOKKOS_INLINE_FUNCTION
   bool
   PointInclusion<shards::Triangle<>::key>::
-  check(const PointViewType &point, const double threshold) {
-    const double distance = max( max( -point(0), -point(1) ), point(0) + point(1) - 1.0 );
+  check(const PointViewType &point, const ScalarType threshold) {
+    const ScalarType distance = max( max( -point(0), -point(1) ), point(0) + point(1) - 1.0 );
     return distance < threshold;
   }
   
-  template<typename PointViewType>
+  template<typename PointViewType, typename ScalarType>
   KOKKOS_INLINE_FUNCTION
   bool
   PointInclusion<shards::Quadrilateral<>::key>::
   check(const PointViewType &point, 
-                      const double threshold) {
-    const double minus_one = -1.0 - threshold, plus_one = 1.0 + threshold;
+                      const ScalarType threshold) {
+    const ScalarType minus_one = -1.0 - threshold, plus_one = 1.0 + threshold;
     return ((minus_one <= point(0) && point(0) <= plus_one) &&
             (minus_one <= point(1) && point(1) <= plus_one));
   }  
 
-  template<typename PointViewType>
+  template<typename PointViewType, typename ScalarType>
   KOKKOS_INLINE_FUNCTION
   bool
   PointInclusion<shards::Tetrahedron<>::key>::
-  check(const PointViewType &point, const double threshold) {
-    const double distance = max( max(-point(0),-point(1)),
+  check(const PointViewType &point, const ScalarType threshold) {
+    const ScalarType distance = max( max(-point(0),-point(1)),
                                   max(-point(2), point(0) + point(1) + point(2) - 1) );
     return distance < threshold;
   }
 
-  template<typename PointViewType>
+  template<typename PointViewType, typename ScalarType>
   KOKKOS_INLINE_FUNCTION
   bool
   PointInclusion<shards::Hexahedron<>::key>::
-  check(const PointViewType &point, const double threshold) {
-    const double minus_one = -1.0 - threshold, plus_one = 1.0 + threshold;
+  check(const PointViewType &point, const ScalarType threshold) {
+    const ScalarType minus_one = -1.0 - threshold, plus_one = 1.0 + threshold;
     return ((minus_one <= point(0) && point(0) <= plus_one) &&
             (minus_one <= point(1) && point(1) <= plus_one) &&
             (minus_one <= point(2) && point(2) <= plus_one));
   }
   
-  template<typename PointViewType>
+  template<typename PointViewType, typename ScalarType>
   KOKKOS_INLINE_FUNCTION
   bool
   PointInclusion<shards::Pyramid<>::key>::
-  check(const PointViewType &point, const double threshold) {
-    const double minus_one = -1.0 - threshold, plus_one = 1.0 + threshold, minus_zero = -threshold;
-    const double left  = minus_one + point(2);
-    const double right =  plus_one - point(2);
+  check(const PointViewType &point, const ScalarType threshold) {
+    const ScalarType minus_one = -1.0 - threshold, plus_one = 1.0 + threshold, minus_zero = -threshold;
+    const ScalarType left  = minus_one + point(2);
+    const ScalarType right =  plus_one - point(2);
     return ((left       <= point(0) && point(0) <= right) &&
             (left       <= point(1) && point(1) <= right) &&
             (minus_zero <= point(2) && point(2) <= plus_one));
   }
 
-  template<typename PointViewType>
+  template<typename PointViewType, typename ScalarType>
   KOKKOS_INLINE_FUNCTION
   bool
   PointInclusion<shards::Wedge<>::key>::
-  check(const PointViewType &point, const double threshold) {
-    const double minus_one = -1.0 - threshold, plus_one = 1.0 + threshold;
-    const double distance = max( max( -point(0), -point(1) ), point(0) + point(1) - 1 );
+  check(const PointViewType &point, const ScalarType threshold) {
+    const ScalarType minus_one = -1.0 - threshold, plus_one = 1.0 + threshold;
+    const ScalarType distance = max( max( -point(0), -point(1) ), point(0) + point(1) - 1 );
     return (distance < threshold && (minus_one <= point(2) && point(2) <= plus_one));
   }
 
diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellTools.hpp b/packages/intrepid2/src/Cell/Intrepid2_CellTools.hpp
index 8a522a544ad3..a9eb6cab7145 100644
--- a/packages/intrepid2/src/Cell/Intrepid2_CellTools.hpp
+++ b/packages/intrepid2/src/Cell/Intrepid2_CellTools.hpp
@@ -352,11 +352,11 @@ namespace Intrepid2 {
     
     /** \brief  Computes reciprocals of determinants corresponding to the Jacobians in the Data container provided
 
-        \param  jacobianDet   [out]  - data with shape (C,P), as returned by CellTools::allocateJacobianDet()
-        \param  jacobian          [in]    - data with shape (C,P,D,D), as returned by CellGeometry::allocateJacobianData()
+        \param  jacobianDetInv   [out]  - data with shape (C,P), as returned by CellTools::allocateJacobianDet()
+        \param  jacobian                 [in]    - data with shape (C,P,D,D), as returned by CellGeometry::allocateJacobianData()
     */
     template<class PointScalar>
-    static void setJacobianDetInv( Data<PointScalar,DeviceType> & jacobianDet,
+    static void setJacobianDetInv( Data<PointScalar,DeviceType> & jacobianDetInv,
                                   const Data<PointScalar,DeviceType> & jacobian);
 
     /** \brief  Computes determinants corresponding to the Jacobians in the Data container provided
@@ -1396,11 +1396,13 @@ namespace Intrepid2 {
         \param  threshold         [in]  - "tightness" of the inclusion test
         \return true if the point is in the closure of the specified reference cell and false otherwise.
     */
-    template<typename pointViewType>
+    template<typename PointViewType>
     static bool 
-    checkPointInclusion( const pointViewType        point,
+    checkPointInclusion( const PointViewType        point,
                          const shards::CellTopology cellTopo,
-                         const double               thres = threshold() );
+                         const typename ScalarTraits<typename PointViewType::value_type>::scalar_type thres = 
+                               threshold<typename ScalarTraits<typename PointViewType::value_type>::scalar_type>() );
+
 
 
     /** \brief  Checks every point for inclusion in the reference cell of a given topology.
@@ -1417,7 +1419,8 @@ namespace Intrepid2 {
              typename InputViewType>
     static void checkPointwiseInclusion(       OutputViewType inCell, 
                                          const InputViewType points,
-                                         const double thresh = threshold()); 
+                                         const typename ScalarTraits<typename InputViewType::value_type>::scalar_type thresh =
+                                               threshold<typename ScalarTraits<typename InputViewType::value_type>::scalar_type>()); 
 
 
 
@@ -1434,7 +1437,8 @@ namespace Intrepid2 {
     static void checkPointwiseInclusion(       InCellViewType inCell,                     
                                          const PointViewType points,                       
                                          const shards::CellTopology cellTopo,                                                       
-                                         const double thres = threshold() );
+                                         const typename ScalarTraits<typename PointViewType::value_type>::scalar_type thres = 
+                                               threshold<typename ScalarTraits<typename PointViewType::value_type>::scalar_type>() );
 
     /** \brief  Checks every points for inclusion in physical cells from a cell workset.
                 The points can belong to a global set and stored in a rank-2 (P,D) view,
@@ -1454,7 +1458,8 @@ namespace Intrepid2 {
                                          const Kokkos::DynRankView<pointValueType,pointProperties...> points,                       
                                          const Kokkos::DynRankView<cellWorksetValueType,cellWorksetProperties...> cellWorkset,      
                                          const shards::CellTopology cellTopo,                                                       
-                                         const double thres = threshold() );
+                                         const typename ScalarTraits<pointValueType>::scalar_type thres = 
+                                               threshold<typename ScalarTraits<pointValueType>::scalar_type>() );
 
 
     // //============================================================================================//
diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellToolsDefInclusion.hpp b/packages/intrepid2/src/Cell/Intrepid2_CellToolsDefInclusion.hpp
index 5e1b091e3638..1d9ecfe94b63 100644
--- a/packages/intrepid2/src/Cell/Intrepid2_CellToolsDefInclusion.hpp
+++ b/packages/intrepid2/src/Cell/Intrepid2_CellToolsDefInclusion.hpp
@@ -34,9 +34,9 @@ namespace Intrepid2 {
   template<typename PointViewType>
   bool 
   CellTools<DeviceType>::
-  checkPointInclusion( const PointViewType        point,
-                       const shards::CellTopology cellTopo,
-                       const double               threshold) {
+  checkPointInclusion( const PointViewType          point,
+                       const shards::CellTopology   cellTopo,
+                       const typename ScalarTraits<typename PointViewType::value_type>::scalar_type threshold) {
 #ifdef HAVE_INTREPID2_DEBUG
     INTREPID2_TEST_FOR_EXCEPTION( point.rank() != 1, std::invalid_argument,
                                   ">>> ERROR (Intrepid2::CellTools::checkPointInclusion): Point must have rank 1. ");
@@ -94,12 +94,13 @@ namespace Intrepid2 {
   struct checkPointInclusionFunctor {
     OutputViewType output_;
     InputViewType input_;
-    double threshold_;
+    using ScalarType = typename ScalarTraits<typename InputViewType::value_type>::scalar_type;
+    ScalarType threshold_;
 
     KOKKOS_INLINE_FUNCTION
-    checkPointInclusionFunctor(       OutputViewType output,
-                               const  InputViewType  input,
-                               const  double         threshold)
+    checkPointInclusionFunctor(       OutputViewType                      output,
+                               const  InputViewType                       input,
+                               const  ScalarType threshold)
       : output_(output), 
         input_(input),
         threshold_(threshold) {}
@@ -129,7 +130,7 @@ namespace Intrepid2 {
   void CellTools<DeviceType>::
   checkPointwiseInclusion(      OutputViewType inCell, 
                           const InputViewType  points,
-                          const double         threshold) {     
+                          const typename ScalarTraits<typename InputViewType::value_type>::scalar_type threshold) {     
 
      using FunctorType = checkPointInclusionFunctor<cellTopologyKey,decltype(inCell),decltype(points)>;
     if (points.rank() == 2) {     // inCell.rank() == 1
@@ -144,13 +145,13 @@ namespace Intrepid2 {
 
   template<typename DeviceType>
   template<typename InCellViewType,
-           typename PointViewType>
+           typename InputViewType>
   void
   CellTools<DeviceType>::
-  checkPointwiseInclusion(       InCellViewType       inCell,
-                           const PointViewType        points,
-                           const shards::CellTopology cellTopo,
-                           const double               threshold ) {
+  checkPointwiseInclusion(       InCellViewType         inCell,
+                           const InputViewType          points,
+                           const shards::CellTopology   cellTopo,
+                           const typename ScalarTraits<typename InputViewType::value_type>::scalar_type threshold ) {
 #ifdef HAVE_INTREPID2_DEBUG
     {
       INTREPID2_TEST_FOR_EXCEPTION( (inCell.rank() != 1) && (inCell.rank() != 2), std::invalid_argument,
@@ -218,7 +219,7 @@ namespace Intrepid2 {
                            const Kokkos::DynRankView<pointValueType,pointProperties...> points,
                            const Kokkos::DynRankView<cellWorksetValueType,cellWorksetProperties...> cellWorkset,
                            const shards::CellTopology cellTopo,
-                           const double threshold ) {
+                           const typename ScalarTraits<pointValueType>::scalar_type threshold ) {
 #ifdef HAVE_INTREPID2_DEBUG
     {
       const auto key = cellTopo.getBaseKey();
diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisValues.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisValues.hpp
index 9750acf87e4d..588957c915a5 100644
--- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisValues.hpp
+++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisValues.hpp
@@ -31,18 +31,18 @@
 
 namespace Intrepid2
 {
-  template<class Scalar, typename ExecSpaceType>
+  template<class Scalar, typename DeviceType>
   class BasisValues
   {
-    using TensorDataType = TensorData<Scalar,ExecSpaceType>;
-    using VectorDataType = VectorData<Scalar,ExecSpaceType>;
+    using TensorDataType = TensorData<Scalar,DeviceType>;
+    using VectorDataType = VectorData<Scalar,DeviceType>;
     
     Kokkos::Array<TensorDataType,Parameters::MaxTensorComponents> tensorDataFamilies_;
     VectorDataType vectorData_;
     
     int numTensorDataFamilies_ = -1;
     
-    Kokkos::View<ordinal_type*,ExecSpaceType> ordinalFilter_;
+    Kokkos::View<ordinal_type*,DeviceType> ordinalFilter_;
   public:
     //! Constructor for scalar-valued BasisValues with a single family of values.
     BasisValues(TensorDataType tensorData)
@@ -76,8 +76,8 @@ namespace Intrepid2
     
     
     //! copy-like constructor for differing execution spaces.  This does a deep copy of underlying views.
-    template<typename OtherExecSpaceType, class = typename std::enable_if<!std::is_same<ExecSpaceType, OtherExecSpaceType>::value>::type>
-    BasisValues(const BasisValues<Scalar,OtherExecSpaceType> &basisValues)
+    template<typename OtherDeviceType, class = typename std::enable_if<!std::is_same<DeviceType, OtherDeviceType>::value>::type>
+    BasisValues(const BasisValues<Scalar,OtherDeviceType> &basisValues)
     :
     vectorData_(basisValues.vectorData()),
     numTensorDataFamilies_(basisValues.numTensorDataFamilies())
@@ -85,16 +85,16 @@ namespace Intrepid2
       auto otherFamilies = basisValues.tensorDataFamilies();
       for (int family=0; family<numTensorDataFamilies_; family++)
       {
-        tensorDataFamilies_[family] = TensorData<Scalar,ExecSpaceType>(otherFamilies[family]);
+        tensorDataFamilies_[family] = TensorData<Scalar,DeviceType>(otherFamilies[family]);
       }
       auto otherOrdinalFilter = basisValues.ordinalFilter();
-      ordinalFilter_ = Kokkos::View<ordinal_type*,ExecSpaceType>("BasisValues::ordinalFilter_",otherOrdinalFilter.extent(0));
+      ordinalFilter_ = Kokkos::View<ordinal_type*,DeviceType>("BasisValues::ordinalFilter_",otherOrdinalFilter.extent(0));
       
       Kokkos::deep_copy(ordinalFilter_, otherOrdinalFilter);
     }
     
     //! field start and length must align with families in vectorData_ or tensorDataFamilies_ (whichever is valid).
-    BasisValues<Scalar,ExecSpaceType> basisValuesForFields(const int &fieldStartOrdinal, const int &numFields)
+    BasisValues<Scalar,DeviceType> basisValuesForFields(const int &fieldStartOrdinal, const int &numFields)
     {
       int familyStartOrdinal = -1, familyEndOrdinal = -1;
       const int familyCount = this->numFamilies();
@@ -118,12 +118,12 @@ namespace Intrepid2
         {
           tensorDataFamilies[i-familyStartOrdinal] = tensorDataFamilies_[i];
         }
-        return BasisValues<Scalar,ExecSpaceType>(tensorDataFamilies);
+        return BasisValues<Scalar,DeviceType>(tensorDataFamilies);
       }
       else
       {
         const int componentCount = vectorData_.numComponents();
-        std::vector< std::vector<TensorData<Scalar,ExecSpaceType> > > vectorComponents(numFamiliesInFieldSpan, std::vector<TensorData<Scalar,ExecSpaceType> >(componentCount));
+        std::vector< std::vector<TensorData<Scalar,DeviceType> > > vectorComponents(numFamiliesInFieldSpan, std::vector<TensorData<Scalar,DeviceType> >(componentCount));
         for (int i=familyStartOrdinal; i<=familyEndOrdinal; i++)
         {
           for (int j=0; j<componentCount; j++)
@@ -131,7 +131,7 @@ namespace Intrepid2
             vectorComponents[i-familyStartOrdinal][j] = vectorData_.getComponent(i,j);
           }
         }
-        return BasisValues<Scalar,ExecSpaceType>(vectorComponents);
+        return BasisValues<Scalar,DeviceType>(vectorComponents);
       }
     }
     
@@ -327,16 +327,22 @@ namespace Intrepid2
       }
     }
     
-    void setOrdinalFilter(Kokkos::View<ordinal_type*,ExecSpaceType> ordinalFilter)
+    void setOrdinalFilter(Kokkos::View<ordinal_type*,DeviceType> ordinalFilter)
     {
       ordinalFilter_ = ordinalFilter;
     }
     
-    Kokkos::View<ordinal_type*,ExecSpaceType> ordinalFilter() const
+    Kokkos::View<ordinal_type*,DeviceType> ordinalFilter() const
     {
       return ordinalFilter_;
     }
   };
-}
+  //! Free-function counterpart to BasisValues::rank(): returns basisValues.rank() for the object passed in.
+  template<class Scalar, typename DeviceType>
+  KOKKOS_INLINE_FUNCTION unsigned rank(const BasisValues<Scalar,DeviceType> &basisValues)
+  {
+    return basisValues.rank();
+  }
+} // namespace Intrepid2
 
 #endif /* Intrepid2_BasisValues_h */
diff --git a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp
index 380e820c2d71..a8e57b15d5ef 100644
--- a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp
+++ b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp
@@ -15,6 +15,7 @@
 #ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
 #define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
 
+#include "Intrepid2_DataTools.hpp"
 #include "Intrepid2_FunctorIterator.hpp"
 #include "Intrepid2_TensorArgumentIterator.hpp"
 
@@ -123,7 +124,7 @@ namespace Intrepid2 {
         // prepare for allocation of temporary storage
         // note: tempStorage goes "backward", starting from the final component, which needs just one entry
         
-        const bool allocateFadStorage = !std::is_pod<Scalar>::value;
+        const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);  
         if (allocateFadStorage)
         {
           fad_size_output_ = dimension_scalar(integralView_);
@@ -1063,7 +1064,7 @@ namespace Intrepid2 {
         // prepare for allocation of temporary storage
         // note: tempStorage goes "backward", starting from the final component, which needs just one entry
 
-        const bool allocateFadStorage = !std::is_pod<Scalar>::value;
+        const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
         if (allocateFadStorage)
         {
           fad_size_output_ = dimension_scalar(integralView_);
@@ -1210,7 +1211,6 @@ namespace Intrepid2 {
         const int GyEntryCount     = pointBounds_z; // for each thread: store one Gy value per z coordinate
         Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals; // for caching Gx values: we integrate out the first component dimension for each coordinate in the remaining dimensios
         Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals; // for caching Gy values (each thread gets a stack, of the same height as tensorComponents - 1)
-        Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GzIntegral;  // for one Gz value that we sum into before summing into the destination matrix
         Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
         
         Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
@@ -1219,7 +1219,6 @@ namespace Intrepid2 {
         if (fad_size_output_ > 0) {
           GxIntegrals   = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),   pointsInNonzeroComponentDimensions, fad_size_output_);
           GyIntegrals   = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),   GyEntryCount * numThreads,          fad_size_output_);
-          GzIntegral    = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),   numThreads,                         fad_size_output_);
           pointWeights  = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>  (teamMember.team_shmem(), composedTransform_.extent_int(1),   fad_size_output_);
           
           leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds_x, pointBounds_x, fad_size_output_);
@@ -1232,7 +1231,6 @@ namespace Intrepid2 {
         else {
           GxIntegrals   = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  pointsInNonzeroComponentDimensions);
           GyIntegrals   = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  GyEntryCount * numThreads);
-          GzIntegral    = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  numThreads);
           pointWeights  = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>  (teamMember.team_shmem(),  composedTransform_.extent_int(1));
         
           leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds_x, pointBounds_x);
@@ -1376,43 +1374,67 @@ namespace Intrepid2 {
                   const int i1 = i1j1 % leftFieldBounds_y;
                   const int j1 = i1j1 / leftFieldBounds_y;
                   
-                  int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
+                  int Gy_index_offset = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
                   
-                  int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
                   for (int lz=0; lz<pointBounds_z; lz++)
                   {
-                    Scalar & Gy = GyIntegrals(Gy_index);
-                    Gy = 0.0;
-                    
-                    for (int ly=0; ly<pointBounds_y; ly++)
+                    int pointEnumerationIndex = lz * pointBounds_y;
+                    if (fad_size_output_ == 0)
                     {
-                      const Scalar &  leftValue =  leftFields_y(i1,ly);
-                      const Scalar & rightValue = rightFields_y(j1,ly);
-                    
-                      Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
+                      Scalar Gy_local = 0;
+                      
+                      // not a Fad type; we're allowed to have a vector range
+                      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_y), [&] (const int &ly, Scalar &integralThusFar)
+                      {
+                        const Scalar &  leftValue =  leftFields_y(i1,ly);
+                        const Scalar & rightValue = rightFields_y(j1,ly);
+                        
+                        integralThusFar += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
+                      }, Gy_local);
                       
-                      pointEnumerationIndex++;
+                    GyIntegrals(Gy_index_offset + lz) = Gy_local;
+                    }
+                    else
+                    {
+                      Scalar & Gy = GyIntegrals(Gy_index_offset + lz);
+                      Gy = 0.0; // reset the reused team-scratch entry before accumulating, as the replaced code did; otherwise += sums onto stale values
+                      for (int ly=0; ly<pointBounds_y; ly++)
+                      {
+                        const Scalar &  leftValue =  leftFields_y(i1,ly);
+                        const Scalar & rightValue = rightFields_y(j1,ly);
+                        Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
+                      }
                     }
-                    Gy_index++;
                   }
                       
-                  Scalar & Gz = GzIntegral(threadNumber); // one entry per thread
                   for (int i2=0; i2<leftFieldBounds_z; i2++)
                   {
                     for (int j2=0; j2<rightFieldBounds_z; j2++)
                     {
-                      Gz = 0.0;
+                      Scalar Gz = 0.0;
                       
-                      int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
+                      int Gy_index_offset = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
                       
-                      for (int lz=0; lz<pointBounds_z; lz++)
+                      if (fad_size_output_ == 0)
                       {
-                        const Scalar &  leftValue =  leftFields_z(i2,lz);
-                        const Scalar & rightValue = rightFields_z(j2,lz);
-                        
-                        Gz += leftValue * rightValue * GyIntegrals(Gy_index);
-                        
-                        Gy_index++;
+                        // not a Fad type; we're allowed to have a vector range
+                        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_z), [&] (const int &lz, Scalar &integralThusFar)
+                        {
+                          const Scalar &  leftValue =  leftFields_z(i2,lz);
+                          const Scalar & rightValue = rightFields_z(j2,lz);
+                          
+                          integralThusFar += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
+                        }, Gz);
+                      }
+                      else
+                      {
+                        for (int lz=0; lz<pointBounds_z; lz++)
+                        {
+                          const Scalar &  leftValue =  leftFields_z(i2,lz);
+                          const Scalar & rightValue = rightFields_z(j2,lz);
+                          
+                          Gz += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
+                        }
                       }
                       
                       const int i =  leftFieldOrdinalOffset + i0 + (i1 + i2 *  leftFieldBounds_y) *  leftFieldBounds_x;
@@ -1421,7 +1443,9 @@ namespace Intrepid2 {
 //                      const int i = relativeEnumerationIndex( leftArguments,  leftFieldBounds, 0) +  leftFieldOrdinalOffset;
 //                      const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset;
                       
-                      integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
+                      Kokkos::single (Kokkos::PerThread(teamMember), [&] () {
+                        integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
+                      });
                     }
                   }
                 });
@@ -1766,7 +1790,6 @@ namespace Intrepid2 {
         {
           shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_); // GxIntegrals: entries with x integrated away
           shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads,          fad_size_output_); // GyIntegrals: entries with x,y integrated away
-          shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(           1 * numThreads,          fad_size_output_); // GzIntegral:  entry   with x,y,z integrated away
           shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size  (composedTransform_.extent_int(1), fad_size_output_); // pointWeights
           
           shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(  leftFieldBounds_[0], pointBounds_[0], fad_size_output_); // leftFields_x
@@ -1780,7 +1803,6 @@ namespace Intrepid2 {
         {
           shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions);  // GxIntegrals: entries with x integrated away
           shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads);           // GyIntegrals: entries with x,y integrated away
-          shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads);                     // GzIntegral:  entry   with x,y,z integrated away
           shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size  (composedTransform_.extent_int(1)); // pointWeights
           
           shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(  leftFieldBounds_[0], pointBounds_[0]); // leftFields_x
@@ -1940,16 +1962,14 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
   // we require that the number of tensor components in the vectors are the same for each vector entry
   // this is not strictly necessary, but it makes implementation easier, and we don't at present anticipate other use cases
   int numTensorComponentsLeft = -1;
-  const bool isVectorValued = basisValuesLeft.vectorData().isValid();
-  if (isVectorValued)
+  const bool leftIsVectorValued = basisValuesLeft.vectorData().isValid();
+  
+  if (leftIsVectorValued)
   {
-    const bool rightIsVectorValued = basisValuesRight.vectorData().isValid();
-    INTREPID2_TEST_FOR_EXCEPTION(!rightIsVectorValued, std::invalid_argument, "left and right must either both be vector-valued, or both scalar-valued");
     const auto &refVectorLeft   = basisValuesLeft.vectorData();
     int numFamiliesLeft         = refVectorLeft.numFamilies();
     int numVectorComponentsLeft = refVectorLeft.numComponents();
     Kokkos::Array<int,7> maxFieldsForComponentLeft  {0,0,0,0,0,0,0};
-    Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
     for (int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
     {
       for (int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
@@ -1969,10 +1989,24 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
         }
       }
     }
-    int numTensorComponentsRight = -1;
+  }
+  else
+  {
+    numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents(); // family ordinal 0
+    for (int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
+    {
+      INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "All families must match in the number of tensor components");
+    }
+  }
+  int numTensorComponentsRight = -1;
+  const bool rightIsVectorValued = basisValuesRight.vectorData().isValid();
+  
+  if (rightIsVectorValued)
+  {
     const auto &refVectorRight   = basisValuesRight.vectorData();
     int numFamiliesRight         = refVectorRight.numFamilies();
     int numVectorComponentsRight = refVectorRight.numComponents();
+    Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
     for (int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
     {
       for (int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
@@ -1992,17 +2026,11 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
         }
       }
     }
-    INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != numVectorComponentsRight, std::invalid_argument, "Left and right vector entries must have the same number of tensorial components");
+    INTREPID2_TEST_FOR_EXCEPTION(numTensorComponentsRight != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
   }
   else
   {
-    numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents(); // family ordinal 0
-    for (int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
-    {
-      INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "All families must match in the number of tensor components");
-    }
-    
-    // check that right tensor component count also agrees
+    // check that right tensor component count agrees with left
     for (int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
     {
       INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
@@ -2042,11 +2070,11 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
       int a_offset = 0; // left vector component offset
       int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
       
-      const int leftVectorComponentCount = isVectorValued ? basisValuesLeft.vectorData().numComponents() : 1;
+      const int leftVectorComponentCount = leftIsVectorValued ? basisValuesLeft.vectorData().numComponents() : 1;
       for (int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
       {
-        TensorData<Scalar,DeviceType> leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal)
-                                                                     : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
+        TensorData<Scalar,DeviceType> leftComponent = leftIsVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal)
+                                                                         : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
         if (!leftComponent.isValid())
         {
           a_offset++; // empty components are understood to take up one dimension
@@ -2061,11 +2089,11 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
           int b_offset = 0; // right vector component offset
           int rightFieldOffset = basisValuesRight.vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
 
-          const int rightVectorComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
+          const int rightVectorComponentCount = rightIsVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
           for (int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
           {
-            TensorData<Scalar,DeviceType> rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal)
-                                                                          : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
+            TensorData<Scalar,DeviceType> rightComponent = rightIsVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal)
+                                                                               : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
             if (!rightComponent.isValid())
             {
               b_offset++; // empty components are understood to take up one dimension
@@ -2127,7 +2155,7 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
               {
                 ScalarView<Scalar,DeviceType> componentIntegralView;
                 
-                const bool allocateFadStorage = !std::is_pod<Scalar>::value;
+                const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
                 if (allocateFadStorage)
                 {
                   auto fad_size_output = dimension_scalar(integrals.getUnderlyingView());
@@ -2223,15 +2251,23 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
     const bool transposeRight = false;
 //    auto timer = Teuchos::TimeMonitor::getNewTimer("mat-mat");
 //    timer->start();
-    // transforms can be matrices -- (C,P,D,D): rank 4 -- or scalar weights -- (C,P): rank 2
-    const bool matrixTransform = (leftTransform.rank() == 4) || (rightTransform.rank() == 4);
+    // transforms can be matrices -- (C,P,D,D): rank 4 -- or scalar weights -- (C,P): rank 2 -- or vector weights -- (C,P,D): rank 3
     Data<Scalar,DeviceType> composedTransform;
     // invalid/empty transforms are used when the identity is intended.
+    const int leftRank  = leftTransform.rank();
+    const int rightRank = rightTransform.rank();
+    
     if (leftTransform.isValid() && rightTransform.isValid())
     {
-      if (matrixTransform)
+      const bool bothRank4 = (leftRank == 4) && (rightRank == 4);
+      const bool bothRank3 = (leftRank == 3) && (rightRank == 3);
+      const bool bothRank2 = (leftRank == 2) && (rightRank == 2);
+      const bool ranks32   = ((leftRank == 3) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 3));
+      const bool ranks42   = ((leftRank == 4) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 4));
+      
+      if (bothRank4) // (C,P,D,D)
       {
-        composedTransform = leftTransform.allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
+        composedTransform = Data<Scalar,DeviceType>::allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
         composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
         
         // if the composedTransform matrices are full, the following is a good estimate.  If they have some diagonal portions, this will overcount.
@@ -2240,12 +2276,41 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
           *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2;
         }
       }
-      else
+      else if (bothRank3) // (C,P,D)
+      {
+        // re-cast leftTransform as a rank 4 (C,P,1,D) object -- a 1 x D matrix at each (C,P).
+        const int newRank   = 4;
+        auto extents        = leftTransform.getExtents();
+        auto variationTypes = leftTransform.getVariationTypes();
+        extents[3]               = extents[2];
+        extents[2]               = 1;
+        variationTypes[3]        = variationTypes[2];
+        variationTypes[2]        = CONSTANT;
+        auto leftTransformMatrix = leftTransform.shallowCopy(newRank, extents, variationTypes);
+        
+        // re-cast rightTransform as a rank 4 (C,P,1,D) object -- a 1 x D matrix at each (C,P)
+        extents                  = rightTransform.getExtents();
+        variationTypes           = rightTransform.getVariationTypes();
+        extents[3]               = extents[2];
+        extents[2]               = 1;
+        variationTypes[3]        = variationTypes[2];
+        variationTypes[2]        = CONSTANT;
+        auto rightTransformMatrix = rightTransform.shallowCopy(newRank, extents, variationTypes);
+        
+        composedTransform = Data<Scalar,DeviceType>::allocateMatMatResult(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix); // false: don't transpose
+        composedTransform.storeMatMat(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix);
+                
+        if (approximateFlops != NULL)
+        {
+          *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
+        }
+      }
+      else if (bothRank2)
       {
         composedTransform = leftTransform.allocateInPlaceCombinationResult(leftTransform, rightTransform);
         composedTransform.storeInPlaceProduct(leftTransform, rightTransform);
         
-        // re-cast composedTranform as a rank 4 (C,P,D,D) object -- a 1 x 1 matrix at each (C,P).
+        // re-cast composedTransform as a rank 4 (C,P,1,1) object -- a 1 x 1 matrix at each (C,P).
         const int newRank   = 4;
         auto extents        = composedTransform.getExtents();
         auto variationTypes = composedTransform.getVariationTypes();
@@ -2255,16 +2320,100 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
           *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
         }
       }
+      else if (ranks32) // rank 2 / rank 3 combination.
+      {
+        const auto & rank3Transform = (leftRank == 3) ? leftTransform : rightTransform;
+        const auto & rank2Transform = (leftRank == 2) ? leftTransform : rightTransform;
+        
+        composedTransform = DataTools::multiplyByCPWeights(rank3Transform, rank2Transform);
+        
+        // re-cast composedTransform as a rank 4 object:
+        // logically, the original rank-3 transform can be understood as a 1xD matrix.  The composed transform is leftTransform^T * rightTransform, so:
+        // - if left  has the rank-3 transform, composedTransform should be a (C,P,D,1) object -- a D x 1 matrix at each (C,P).
+        // - if right has the rank-3 transform, composedTransform should be a (C,P,1,D) object -- a 1 x D matrix at each (C,P).
+        const int newRank   = 4;
+        auto extents        = composedTransform.getExtents();
+        auto variationTypes = composedTransform.getVariationTypes();
+        if (leftRank == 3)
+        {
+          // extents[3] and variationTypes[3] will already be 1 and CONSTANT, respectively
+          // extents[3]               = 1;
+          // variationTypes[3]        = CONSTANT;
+        }
+        else
+        {
+          extents[3]               = extents[2];
+          extents[2]               = 1;
+          variationTypes[3]        = variationTypes[2];
+          variationTypes[2]        = CONSTANT;
+        }
+        composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
+      }
+      else if (ranks42) // rank 4 / rank 2 combination.
+      {
+        if (leftRank == 4)
+        {
+          // want to transpose left matrix, and multiply by the values from rightTransform
+          // start with the multiplication:
+          auto composedTransformTransposed = DataTools::multiplyByCPWeights(leftTransform, rightTransform);
+          composedTransform = DataTools::transposeMatrix(composedTransformTransposed);
+        }
+        else // (leftRank == 2)
+        {
+          composedTransform = DataTools::multiplyByCPWeights(rightTransform, leftTransform);
+        }
+      }
+      else
+      {
+        INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
+      }
     }
     else if (leftTransform.isValid())
     {
       // rightTransform is the identity
-      composedTransform = leftTransform;
+      switch (leftRank)
+      {
+        case 4: composedTransform = DataTools::transposeMatrix(leftTransform); break;
+        case 3:
+        {
+          // - if left  has the rank-3 transform, composedTransform should be a (C,P,D,1) object -- a D x 1 matrix at each (C,P).
+          const int newRank   = 4;
+          auto extents        = leftTransform.getExtents();
+          auto variationTypes = leftTransform.getVariationTypes();
+          
+          composedTransform = leftTransform.shallowCopy(newRank, extents, variationTypes);
+        }
+          break;
+        case 2: composedTransform = leftTransform; break;
+        default:
+          INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
+      }
     }
     else if (rightTransform.isValid())
     {
       // leftTransform is the identity
       composedTransform = rightTransform;
+      switch (rightRank)
+      {
+        case 4: composedTransform = rightTransform; break;
+        case 3:
+        {
+          // - if right has the rank-3 transform, composedTransform should be a (C,P,1,D) object -- a 1 x D matrix at each (C,P).
+          const int newRank   = 4;
+          auto extents        = rightTransform.getExtents();
+          auto variationTypes = rightTransform.getVariationTypes();
+          extents[3]          = extents[2];
+          variationTypes[3]   = variationTypes[2];
+          extents[2]          = 1;
+          variationTypes[2]   = CONSTANT;
+          
+          composedTransform = rightTransform.shallowCopy(newRank, extents, variationTypes);
+        }
+          break;
+        case 2: composedTransform = rightTransform; break;
+        default:
+          INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
+      }
     }
     else
     {
@@ -2283,8 +2432,8 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
     
     const int leftFamilyCount     = basisValuesLeft. basisValues().numFamilies();
     const int rightFamilyCount    = basisValuesRight.basisValues().numFamilies();
-    const int leftComponentCount  = isVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
-    const int rightComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
+    const int leftComponentCount  = leftIsVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
+    const int rightComponentCount = rightIsVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
     
     int leftFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
     for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
@@ -2295,8 +2444,8 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
       bool haveLaunchedContributionToCurrentFamilyLeft = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
       for (int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
       {
-        TensorData<Scalar,DeviceType> leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal)
-                                                                     : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
+        TensorData<Scalar,DeviceType> leftComponent = leftIsVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal)
+                                                                         : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
         if (!leftComponent.isValid())
         {
            // represents zero
@@ -2313,8 +2462,8 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
           int b_offset = 0;
           for (int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
           {
-            TensorData<Scalar,DeviceType> rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal)
-                                                                          : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
+            TensorData<Scalar,DeviceType> rightComponent = rightIsVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal)
+                                                                               : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
             if (!rightComponent.isValid())
             {
                // represents zero
@@ -2416,13 +2565,13 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
                 }
               }
             }
-            b_offset += isVectorValued ? basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
+            b_offset += rightIsVectorValued ? basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
           }
-          rightFieldOrdinalOffset += isVectorValued ? basisValuesRight.vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
+          rightFieldOrdinalOffset += rightIsVectorValued ? basisValuesRight.vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
         }
-        a_offset += isVectorValued ? basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
+        a_offset += leftIsVectorValued ? basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
       }
-      leftFieldOrdinalOffset += isVectorValued ? basisValuesLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
+      leftFieldOrdinalOffset += leftIsVectorValued ? basisValuesLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
     }
   }
 //  if (approximateFlops != NULL)
diff --git a/packages/intrepid2/src/Shared/Intrepid2_Data.hpp b/packages/intrepid2/src/Shared/Intrepid2_Data.hpp
index 6c7db78d673d..67a713151ada 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_Data.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_Data.hpp
@@ -34,7 +34,7 @@ namespace Intrepid2 {
 \class  Intrepid2::ZeroView
 \brief  A singleton class for a DynRankView containing exactly one zero entry.  (Technically, the entry is DataScalar(), the default value for the scalar type.)  This allows View-wrapping classes to return a reference to zero, even when that zero is not explicitly stored in the wrapped views.
  
-This is used by Interpid2::Data for its getEntry() and getWritableEntry() methods.
+This is used by Intrepid2::Data for its getEntry() and getWritableEntry() methods.
  
  \note There is no protection against the zero value being overwritten; perhaps we should add some (i.e., const-qualify DataScalar).  Because of implementation details in Intrepid2::Data, we don't do so yet.
  */
@@ -1490,43 +1490,37 @@ class ZeroView {
         resultExtents[i]        = 1;
       }
       
-      ScalarView<DataScalar,DeviceType> data;
+      ScalarView<DataScalar,DeviceType> data; // new view will match this one in layout and fad dimension, if any
+      auto viewToMatch = A_MatData.getUnderlyingView();
       if (resultNumActiveDims == 1)
       {
-        auto viewToMatch = A_MatData.getUnderlyingView1(); // new view will match this one in layout and fad dimension, if any
         data = getMatchingViewWithLabel(viewToMatch, "Data mat-mat result", resultDataDims[0]);
       }
       else if (resultNumActiveDims == 2)
       {
-        auto viewToMatch = A_MatData.getUnderlyingView2(); // new view will match this one in layout and fad dimension, if any
         data = getMatchingViewWithLabel(viewToMatch, "Data mat-mat result", resultDataDims[0], resultDataDims[1]);
       }
       else if (resultNumActiveDims == 3)
       {
-        auto viewToMatch = A_MatData.getUnderlyingView3(); // new view will match this one in layout and fad dimension, if any
         data = getMatchingViewWithLabel(viewToMatch, "Data mat-mat result", resultDataDims[0], resultDataDims[1], resultDataDims[2]);
       }
       else if (resultNumActiveDims == 4)
       {
-        auto viewToMatch = A_MatData.getUnderlyingView4(); // new view will match this one in layout and fad dimension, if any
         data = getMatchingViewWithLabel(viewToMatch, "Data mat-mat result", resultDataDims[0], resultDataDims[1], resultDataDims[2],
                                         resultDataDims[3]);
       }
       else if (resultNumActiveDims == 5)
       {
-        auto viewToMatch = A_MatData.getUnderlyingView5(); // new view will match this one in layout and fad dimension, if any
         data = getMatchingViewWithLabel(viewToMatch, "Data mat-mat result", resultDataDims[0], resultDataDims[1], resultDataDims[2],
                                         resultDataDims[3], resultDataDims[4]);
       }
       else if (resultNumActiveDims == 6)
       {
-        auto viewToMatch = A_MatData.getUnderlyingView6(); // new view will match this one in layout and fad dimension, if any
         data = getMatchingViewWithLabel(viewToMatch, "Data mat-mat result", resultDataDims[0], resultDataDims[1], resultDataDims[2],
                                         resultDataDims[3], resultDataDims[4], resultDataDims[5]);
       }
       else // resultNumActiveDims == 7
       {
-        auto viewToMatch = A_MatData.getUnderlyingView7(); // new view will match this one in layout and fad dimension, if any
         data = getMatchingViewWithLabel(viewToMatch, "Data mat-mat result", resultDataDims[0], resultDataDims[1], resultDataDims[2],
                                         resultDataDims[3], resultDataDims[4], resultDataDims[5], resultDataDims[6]);
       }
@@ -1534,6 +1528,37 @@ class ZeroView {
       return Data<DataScalar,DeviceType>(data,resultRank,resultExtents,resultVariationTypes,resultBlockPlusDiagonalLastNonDiagonal);
     }
     
+    //! Constructs a container suitable for storing the result of a contraction over the final dimensions of the two provided containers.  The two containers must have the same rank, and matching extents in every dimension that is retained in the result; the extents of the contracted (trailing) dimensions are not checked here.
+    //! \see storeDotProduct()
+    //! \param A  [in] - the first data container.
+    //! \param B  [in] - the second data container.  Must have the same logical shape as A.
+    //! \param numContractionDims [in] - the number of dimensions over which the contraction should take place.  Must be between 0 and the rank of A and B; otherwise std::invalid_argument is thrown.
+    //! \return A numContractionDims-rank-lower container with the same logical shape as A and B in all but the last dimensions.
+    static Data<DataScalar,DeviceType> allocateContractionResult( const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B, const int &numContractionDims )
+    {
+      INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(A.rank() != B.rank(), std::invalid_argument, "A and B must have the same logical shape");
+      const int rank = A.rank();
+      INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE((numContractionDims < 0) || (numContractionDims > rank), std::invalid_argument, "numContractionDims must be between 0 and the rank of A and B");
+      const int resultRank = rank - numContractionDims; // non-negative, thanks to the check above; a negative value would become a huge std::vector size below
+      std::vector<DimensionInfo> dimInfo(resultRank);
+      for (int d=0; d<resultRank; d++) // only the leading (retained) dimensions contribute to the result shape
+      {
+        INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(A.extent_int(d) != B.extent_int(d), std::invalid_argument, "A and B must have the same logical shape");
+        dimInfo[d] = A.combinedDataDimensionInfo(B, d);
+      }
+      return Data<DataScalar,DeviceType>(dimInfo);
+    }
+    
+    //! Constructs a container suitable for storing the result of a contraction over the final dimension of the two provided containers.  The two containers must have the same logical shape.  Equivalent to allocateContractionResult() with a single contraction dimension.
+    //! \see storeDotProduct()
+    //! \param A  [in] - the first data container.
+    //! \param B  [in] - the second data container.  Must have the same logical shape as A.
+    //! \return A 1-rank-lower container with the same logical shape as A and B in all but the last dimension.
+    static Data<DataScalar,DeviceType> allocateDotProductResult( const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B )
+    {
+      return allocateContractionResult(A, B, 1); // a dot product contracts exactly one (the final) dimension
+    }
+    
     //! Constructs a container suitable for storing the result of a matrix-vector multiply corresponding to the two provided containers.
     //! \see storeMatVec()
     static Data<DataScalar,DeviceType> allocateMatVecResult( const Data<DataScalar,DeviceType> &matData, const Data<DataScalar,DeviceType> &vecData, const bool transposeMatrix = false )
@@ -1618,10 +1643,8 @@ class ZeroView {
       }
       // for the final dimension, the variation type is always GENERAL
       // (Some combinations, e.g. CONSTANT/CONSTANT *would* generate a CONSTANT result, but constant matrices don't make a lot of sense beyond 1x1 matrices…)
-      resultVariationTypes[resultNumActiveDims] = GENERAL;
       resultActiveDims[resultNumActiveDims]     = resultRank - 1;
       resultDataDims[resultNumActiveDims]       = rows;
-      resultExtents[resultRank-1]               = rows;
       resultNumActiveDims++;
       
       for (int i=resultRank; i<7; i++)
@@ -1629,6 +1652,8 @@ class ZeroView {
         resultVariationTypes[i] = CONSTANT;
         resultExtents[i]        = 1;
       }
+      resultVariationTypes[resultRank-1] = GENERAL;
+      resultExtents[resultRank-1]        = rows;
       
       ScalarView<DataScalar,DeviceType> data;
       if (resultNumActiveDims == 1)
@@ -1730,6 +1755,64 @@ class ZeroView {
       }
     }
     
+    //! Places the result of a contraction along the final dimension of A and B into this data container.  This container supplies the result shape: ranks 1, 2, and 3 are handled, and any other rank throws std::logic_error.  A and B are indexed with one more (trailing) argument than this container.  Only the final-dimension extents of A and B are checked for agreement here.
+    void storeDotProduct(const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B)
+    {
+      const int D_DIM = A.rank() - 1; // index of the final dimension of A, the one being contracted
+      INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(A.extent_int(D_DIM) != B.extent_int(D_DIM), std::invalid_argument, "A and B have different extents");
+      const int vectorComponents = A.extent_int(D_DIM); // number of terms summed in each dot product
+      
+      // shallow copy of this to avoid implicit references to this in call to getWritableEntry() below
+      Data<DataScalar,DeviceType> thisData = *this;
+      
+      using ExecutionSpace = typename DeviceType::execution_space;
+      // note the use of getDataExtent() below: we only range over the possibly-distinct entries
+      if (rank_ == 1) // contraction result rank; e.g., (P)
+      {
+        Kokkos::parallel_for("compute dot product", getDataExtent(0),
+        KOKKOS_LAMBDA (const int &pointOrdinal) {
+          auto & val = thisData.getWritableEntry(pointOrdinal);
+          val = 0; // overwrite whatever the result entry held before accumulating
+          for (int i=0; i<vectorComponents; i++)
+          {
+            val += A(pointOrdinal,i) * B(pointOrdinal,i);
+          }
+        });
+      }
+      else if (rank_ == 2) // contraction result rank; e.g., (C,P)
+      {
+        // typical case for e.g. gradient data: (C,P,D)
+        auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{getDataExtent(0),getDataExtent(1)});
+        Kokkos::parallel_for("compute dot product", policy,
+        KOKKOS_LAMBDA (const int &cellOrdinal, const int &pointOrdinal) {
+          auto & val = thisData.getWritableEntry(cellOrdinal, pointOrdinal);
+          val = 0; // overwrite whatever the result entry held before accumulating
+          for (int i=0; i<vectorComponents; i++)
+          {
+            val += A(cellOrdinal,pointOrdinal,i) * B(cellOrdinal,pointOrdinal,i);
+          }
+        });
+      }
+      else if (rank_ == 3) // contraction result rank 3; A and B are indexed with four arguments
+      {
+        auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{getDataExtent(0),getDataExtent(1),getDataExtent(2)});
+        Kokkos::parallel_for("compute dot product", policy,
+        KOKKOS_LAMBDA (const int &cellOrdinal, const int &pointOrdinal, const int &d) {
+          auto & val = thisData.getWritableEntry(cellOrdinal, pointOrdinal,d);
+          val = 0; // overwrite whatever the result entry held before accumulating
+          for (int i=0; i<vectorComponents; i++)
+          {
+            val += A(cellOrdinal,pointOrdinal,d,i) * B(cellOrdinal,pointOrdinal,d,i);
+          }
+        });
+      }
+      else
+      {
+        // TODO: handle other cases
+        INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "rank not yet supported");
+      }
+    }
+    
     //! Places the result of an in-place combination (e.g., entrywise sum) into this data container.
     template<class BinaryOperator>
     void storeInPlaceCombination(const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B, BinaryOperator binaryOperator);
@@ -1909,7 +1992,7 @@ class ZeroView {
         {
           Kokkos::parallel_for("compute mat-mat", policy,
           KOKKOS_LAMBDA (const int &cellOrdinal, const int &pointOrdinal) {
-            for (int i=0; i<leftCols; i++)
+            for (int i=0; i<leftRows; i++)
             {
               for (int j=0; j<rightCols; j++)
               {
diff --git a/packages/intrepid2/src/Shared/Intrepid2_DataTools.hpp b/packages/intrepid2/src/Shared/Intrepid2_DataTools.hpp
index 24b687ee3ced..50cc269455ad 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_DataTools.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_DataTools.hpp
@@ -23,48 +23,81 @@ class DataTools
 {
 public:
   //! Fills Data object of logical shape (C,P,D,D) corresponding to the pointwise product of an object of shape (C,P,D,D) with one of shape (C,P).
-  //! Will also work for any "matrix" data and scalar data which differ in rank by 2, but otherwise share the same shape.  E.g., (C,F,P,D1,D2) matrices could be multiplied by (C,F,P) scalars.
+  //! Will also work for any "matrix" data and scalar data where the "matrix" has equal or higher rank than the scalar, but otherwise share the same shape.  E.g., (C,F,P,D1,D2) matrices could be multiplied by (C,F,P) scalars, or (C,P,D) vectors by (C,P) scalars.
   //! \param resultMatrixData [out] -  the resulting (C,P,D,D) container.  Must be allocated appropriately to store the resulting data; see the implementation of the two-argument multiplyByCPWeights(), which performs this allocation for you.
   //! \param matrixDataIn [in] - the input (C,P,D,D) container.
   //! \param scalarDataIn [in] - the input (C,P) container.
   template<class Scalar, class DeviceType>
   static void multiplyByCPWeights(Data<Scalar,DeviceType> &resultMatrixData, const Data<Scalar,DeviceType> &matrixDataIn, const Data<Scalar,DeviceType> &scalarDataIn)
   {
-    const ordinal_type rank      = scalarDataIn.rank();
-    auto extents                 = scalarDataIn.getExtents();
-    auto variationTypes          = scalarDataIn.getVariationTypes();
-    extents[rank]                = matrixDataIn.extent_int(rank);
-    extents[rank+1]              = matrixDataIn.extent_int(rank+1);
-    variationTypes[rank]         = CONSTANT;
-    variationTypes[rank+1]       = CONSTANT;
+    const ordinal_type rank       = scalarDataIn.rank();
+    const ordinal_type matrixRank = matrixDataIn.rank();
+    auto extents                  = scalarDataIn.getExtents();
+    auto variationTypes           = scalarDataIn.getVariationTypes();
+    for (int r=rank; r<matrixRank; r++) // one pass per dimension the matrix has beyond the scalar's rank; no passes when the ranks are equal
+    {
+      extents[r]        = matrixDataIn.extent_int(r); // take the trailing extent from the matrix
+      variationTypes[r] = CONSTANT; // the scalar weights are marked as not varying along the added dimension
+    }
     
-    auto scalarDataInExtended = scalarDataIn.shallowCopy(rank + 2, extents, variationTypes);
+    auto scalarDataInExtended = scalarDataIn.shallowCopy(matrixRank, extents, variationTypes); // scalar data presented with the matrix's rank and the extents/variation types set above
     resultMatrixData.storeInPlaceProduct(matrixDataIn,scalarDataInExtended);
   }
   
   //! Allocates and fills Data object of logical shape (C,P,D,D) corresponding to the pointwise product of an object of shape (C,P,D,D) with one of shape (C,P).
-  //! Will also work for any "matrix" data and scalar data which differ in rank by 2, but otherwise share the same shape.  E.g., (C,F,P,D1,D2) matrices could be multiplied by (C,F,P) scalars.
+  //! Will also work for any "matrix" data and scalar data where the "matrix" has equal or higher rank than the scalar, but otherwise share the same shape.  E.g., (C,F,P,D1,D2) matrices could be multiplied by (C,F,P) scalars, or (C,P,D) vectors by (C,P) scalars.
   //! \param matrixDataIn [in] - the (C,P,D,D) container.
   //! \param scalarDataIn [in] - the (C,P) container.
   //! \return
   template<class Scalar, class DeviceType>
   static Data<Scalar,DeviceType> multiplyByCPWeights(const Data<Scalar,DeviceType> &matrixDataIn, const Data<Scalar,DeviceType> &scalarDataIn)
   {
-    const ordinal_type rank      = scalarDataIn.rank();
-    auto extents                 = scalarDataIn.getExtents();
-    auto variationTypes          = scalarDataIn.getVariationTypes();
-    extents[rank]                = matrixDataIn.extent_int(rank);
-    extents[rank+1]              = matrixDataIn.extent_int(rank+1);
-    variationTypes[rank]         = CONSTANT;
-    variationTypes[rank+1]       = CONSTANT;
+    const ordinal_type rank       = scalarDataIn.rank();
+    const ordinal_type matrixRank = matrixDataIn.rank();
+    auto extents                  = scalarDataIn.getExtents();
+    auto variationTypes           = scalarDataIn.getVariationTypes();
+    for (int r=rank; r<matrixRank; r++) // one pass per dimension the matrix has beyond the scalar's rank; no passes when the ranks are equal
+    {
+      extents[r]        = matrixDataIn.extent_int(r); // take the trailing extent from the matrix
+      variationTypes[r] = CONSTANT; // the scalar weights are marked as not varying along the added dimension
+    }
     
-    auto scalarDataInExtended = scalarDataIn.shallowCopy(rank + 2, extents, variationTypes);
+    auto scalarDataInExtended = scalarDataIn.shallowCopy(matrixRank, extents, variationTypes); // scalar data presented with the matrix's rank and the extents/variation types set above
     
     auto result = Data<Scalar,DeviceType>::allocateInPlaceCombinationResult(scalarDataInExtended, matrixDataIn);
     
     result.storeInPlaceProduct(matrixDataIn,scalarDataInExtended);
     return result;
   }
+  
+  //! Allocates and fills Data object corresponding to the transpose of matrix data, represented by the last two dimensions of the input object.  The result is computed as (input)^T times an identity matrix, using the Data class's mat-mat routines.
+  //! \param matrixDataIn [in] - the (…,D1,D2) container.
+  //! \return a (…,D2,D1) container containing the transpose of the input matrix data.
+  template<class Scalar, class DeviceType>
+  static Data<Scalar,DeviceType> transposeMatrix(const Data<Scalar,DeviceType> &matrixDataIn)
+  {
+    // A direct construction of the transpose could be more efficient, but here we take advantage of existing
+    // implementations within the Data class supporting matrix-matrix multiplication.  We construct an identity
+    // matrix, and left-multiply this by the transpose of the input matrix.
+    const ordinal_type rank = matrixDataIn.rank();
+    auto extents            = matrixDataIn.getExtents();
+    auto variationTypes     = matrixDataIn.getVariationTypes();
+    const auto D1           = extents[rank-2]; // row count of the input; the identity factor is D1 x D1
+    
+    extents[rank-2]        = D1; // already D1 (read from this same slot above); restated alongside the next line
+    extents[rank-1]        = D1;
+    variationTypes[rank-2] = BLOCK_PLUS_DIAGONAL; // NOTE(review): leading-dimension variation types are inherited unchanged from matrixDataIn while the view below is rank 1 -- confirm this is accepted when the input varies in its leading dimensions
+    variationTypes[rank-1] = BLOCK_PLUS_DIAGONAL;
+    
+    Kokkos::View<Scalar*,DeviceType> identityUnderlyingView("Intrepid2::DataTools::transposeMatrix() - identity view",D1);
+    Kokkos::deep_copy(identityUnderlyingView, 1.0); // D1 entries, all set to one (presumably the diagonal of the identity -- confirm against the BLOCK_PLUS_DIAGONAL storage convention)
+    Data<Scalar,DeviceType> identityData(identityUnderlyingView,extents,variationTypes);
+    
+    auto result = Data<Scalar,DeviceType>::allocateMatMatResult(true, matrixDataIn, false, identityData); // first flag requests the transpose of matrixDataIn; identityData is used untransposed
+    result.storeMatMat(true, matrixDataIn, false, identityData);
+    
+    return result;
+  }
 };
 }
 
diff --git a/packages/intrepid2/src/Shared/Intrepid2_TestUtils.hpp b/packages/intrepid2/src/Shared/Intrepid2_TestUtils.hpp
index 6423eb68d80f..6e56356d86fe 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_TestUtils.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_TestUtils.hpp
@@ -203,7 +203,7 @@ namespace Intrepid2
   template<typename ValueType, typename DeviceType, class ... DimArgs>
   inline ViewType<ValueType,DeviceType> getView(const std::string &label, DimArgs... dims)
   {
-    const bool allocateFadStorage = !std::is_pod<ValueType>::value;
+    const bool allocateFadStorage = !(std::is_standard_layout<ValueType>::value && std::is_trivial<ValueType>::value);
     if (!allocateFadStorage)
     {
       return ViewType<ValueType,DeviceType>(label,dims...);
@@ -218,7 +218,7 @@ namespace Intrepid2
   template<typename ValueType, class ... DimArgs>
   inline FixedRankViewType< typename RankExpander<ValueType, sizeof...(DimArgs) >::value_type, DefaultTestDeviceType > getFixedRankView(const std::string &label, DimArgs... dims)
   {
-    const bool allocateFadStorage = !std::is_pod<ValueType>::value;
+    const bool allocateFadStorage = !(std::is_standard_layout<ValueType>::value && std::is_trivial<ValueType>::value);
     using value_type = typename RankExpander<ValueType, sizeof...(dims) >::value_type;
     if (!allocateFadStorage)
     {
diff --git a/packages/intrepid2/src/Shared/Intrepid2_TransformedBasisValues.hpp b/packages/intrepid2/src/Shared/Intrepid2_TransformedBasisValues.hpp
index bc6250fed912..b177617fd448 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_TransformedBasisValues.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_TransformedBasisValues.hpp
@@ -27,7 +27,7 @@ namespace Intrepid2 {
 /** \class Intrepid2::TransformedBasisValues
     \brief Structure-preserving representation of transformed vector data; reference space values and transformations are stored separately.
  
- TransformedBasisValues provides a View-like interface of rank 4, with shape (C,F,P,D).  When the corresponding accessor is used, the transformed value is determined from corresponding reference space values and the transformation.
+ TransformedBasisValues provides a View-like interface of rank 3 or 4, with shape (C,F,P) or (C,F,P,D).  When the corresponding accessor is used, the transformed value is determined from corresponding reference space values and the transformation.
 */
   template<class Scalar, typename DeviceType>
   class TransformedBasisValues
@@ -35,13 +35,13 @@ namespace Intrepid2 {
   public:
     ordinal_type numCells_;
     
-    Data<Scalar,DeviceType> transform_; // vector case: (C,P,D,D) jacobian or jacobian inverse; can also be unset for identity transform.  Scalar case: (C,P), or unset for identity.
+    Data<Scalar,DeviceType> transform_; // vector case: (C,P,D,D) jacobian or jacobian inverse; can also be unset for identity transform.  Scalar case: (C,P), or unset for identity.  Contracted vector case: (C,P,D) transform, to be contracted with a vector field to produce a scalar result.
     
     BasisValues<Scalar, DeviceType> basisValues_;
     
     /**
      \brief Standard constructor.
-     \param [in] transform - the transformation (matrix), with logical shape (C,P) or (C,P,D,D)
+     \param [in] transform - the transformation (matrix), with logical shape (C,P), (C,P,D), or (C,P,D,D)
      \param [in] basisValues - the reference-space data to be transformed, with logical shape (F,P) (for scalar values) or (F,P,D) (for vector values)
     */
     TransformedBasisValues(const Data<Scalar,DeviceType> &transform, const BasisValues<Scalar,DeviceType> &basisValues)
@@ -52,6 +52,7 @@ namespace Intrepid2 {
     {
       // sanity check: when transform is diagonal, we expect there to be no pointwise variation.
       INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(transform_.isDiagonal() && (transform_.getVariationTypes()[1] != CONSTANT), std::invalid_argument, "When transform is diagonal, we assume in various places that there is no pointwise variation; the transform_ Data should have CONSTANT as its variation type in dimension 1.");
+      INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE((transform_.rank() < 2) || (transform_.rank() > 4), std::invalid_argument, "Only transforms of rank 2, 3, or 4 are supported");
     }
     
     /**
@@ -129,7 +130,7 @@ namespace Intrepid2 {
       }
       else
       {
-        if (transform_.rank() == 4)
+        if ((transform_.rank() == 4) || (transform_.rank() == 3))
         {
           transform_ = DataTools::multiplyByCPWeights(transform_,weightData);
         }
@@ -164,7 +165,22 @@ namespace Intrepid2 {
     //! Returns the logical extent in the space dimension, which is the 3 dimension in this container.
     KOKKOS_INLINE_FUNCTION int spaceDim() const
     {
-      return basisValues_.extent_int(2);
+      if ((transform_.rank() == 3) && (basisValues_.rank() == 3)) // (C,P,D) contracted in D against (F,P,D)
+      {
+        return 1; // spaceDim contracted away
+      }
+      else if ((transform_.rank() == 3) && (basisValues_.rank() == 2)) // (C,P,D) weighting (F,P)
+      {
+        return transform_.extent_int(2); // the transform supplies the D dimension of the (C,F,P,D) result
+      }
+      else if (transform_.rank() == 4) // (C,P,D,D) matrix transform: take the extent of its first matrix dimension
+      {
+        return transform_.extent_int(2);
+      }
+      else // unset (identity) transform or rank-2 (C,P) weights: neither has a space dimension of its own, so defer to the basis values
+      {
+        return basisValues_.extent_int(2);
+      }
     }
     
     //! Scalar accessor, with arguments (C,F,P).
@@ -175,10 +191,20 @@ namespace Intrepid2 {
         // null transform is understood as the identity
         return basisValues_(fieldOrdinal,pointOrdinal);
       }
-      else
+      else if (transform_.rank() == 2)
       {
         return transform_(cellOrdinal,pointOrdinal) * basisValues_(fieldOrdinal,pointOrdinal);
       }
+      else if (transform_.rank() == 3)
+      {
+        Scalar value = 0;
+        for (int d=0; d<transform_.extent_int(2); d++)
+        {
+          value += transform_(cellOrdinal,pointOrdinal,d) * basisValues_(fieldOrdinal,pointOrdinal,d);
+        }
+        return value;
+      }
+      return 0; // should not be reachable
     }
     
     //! Vector accessor, with arguments (C,F,P,D).
@@ -193,7 +219,7 @@ namespace Intrepid2 {
       {
         return transform_(cellOrdinal,pointOrdinal,dim,dim) * basisValues_(fieldOrdinal,pointOrdinal,dim);
       }
-      else
+      else if (transform_.rank() == 4)
       {
         Scalar value = 0.0;
         for (int d2=0; d2<transform_.extent_int(2); d2++)
@@ -202,6 +228,16 @@ namespace Intrepid2 {
         }
         return value;
       }
+      else if (transform_.rank() == 3)
+      {
+        Scalar value = transform_(cellOrdinal,pointOrdinal,dim) * basisValues_(fieldOrdinal,pointOrdinal);
+        return value;
+      }
+      else // rank 2 transform
+      {
+        Scalar value = transform_(cellOrdinal,pointOrdinal) * basisValues_(fieldOrdinal,pointOrdinal,dim);
+        return value;
+      }
     }
     
     //! Returns the specified entry in the (scalar) transform.  (Only valid for scalar-valued BasisValues; see the four-argument transformWeight() for the vector-valued case.)
@@ -218,6 +254,19 @@ namespace Intrepid2 {
       }
     }
     
+    //! Returns the specified entry in the transformation vector: transform_(cellOrdinal,pointOrdinal,d).  Raises std::invalid_argument (via INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE) when the transform is unset.
+    KOKKOS_INLINE_FUNCTION Scalar transformWeight(const int &cellOrdinal, const int &pointOrdinal, const int &d) const
+    {
+      if (!transform_.isValid()) // unset transform: there is no stored vector to read from
+      {
+        INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::invalid_argument, "three-argument transformWeight() is not supported for invalid transform_ object -- no meaningful interpretation for vector-valued identity");
+      }
+      else
+      {
+        return transform_(cellOrdinal,pointOrdinal,d);
+      }
+    }
+    
     //! Returns the specified entry in the transform matrix.
     KOKKOS_INLINE_FUNCTION Scalar transformWeight(const int &cellOrdinal, const int &pointOrdinal, const int &dim1, const int &dim2) const
     {
@@ -248,7 +297,27 @@ namespace Intrepid2 {
     KOKKOS_INLINE_FUNCTION
     unsigned rank() const
     {
-      return basisValues_.rank() + 1; // transformation adds a cell dimension
+      if ((transform_.rank() == 4) && (basisValues_.rank() == 3)) // matrix transform applied to vector-valued basis values
+      {
+        return 4; // (C,F,P,D)
+      }
+      else if ((transform_.rank() == 2) || !transform_.isValid()) // (C,P) scalar weights, or unset transform (understood as the identity)
+      {
+        return basisValues_.rank() + 1; // transformation adds a cell dimension
+      }
+      else if (transform_.rank() == 3)
+      {
+        if (basisValues_.rank() == 3)
+        {
+          return 3; // (C,F,P): transform contracts with basisValues in D dimension
+        }
+        else if (basisValues_.rank() == 2) // (F,P)
+        {
+          return 4; // (C,F,P,D)
+        }
+      }
+      INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::invalid_argument, "Unhandled basisValues_/transform_ rank combination");
+      return 0; // not expected to be reached; keeps every path of this non-void function returning a value
     }
     
     //! Returns the extent in the specified dimension as an int.
diff --git a/packages/intrepid2/src/Shared/Intrepid2_Types.hpp b/packages/intrepid2/src/Shared/Intrepid2_Types.hpp
index c64ba7bf4238..8afcc665653d 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_Types.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_Types.hpp
@@ -65,14 +65,26 @@ namespace Intrepid2 {
     return epsilon<double>();
   }
 
+  template<typename ValueType> // ValueType selects which epsilon<ValueType>() is scaled, and is also the return type
+  KOKKOS_FORCEINLINE_FUNCTION
+  ValueType tolerence() { // returns 100 times epsilon<ValueType>(), converted to ValueType
+    return 100.0*epsilon<ValueType>();
+  }
+
   KOKKOS_FORCEINLINE_FUNCTION
   double tolerence() {
-    return 100.0*epsilon();
+    return tolerence<double>(); // non-template overload forwards to the double instantiation of the templated tolerence()
   }
+
+  template<typename ValueType> // ValueType selects which epsilon<ValueType>() is scaled, and is also the return type
+  KOKKOS_FORCEINLINE_FUNCTION
+  ValueType threshold() { // returns 10 times epsilon<ValueType>(), converted to ValueType
+    return 10.0*epsilon<ValueType>();
+  }
 
   KOKKOS_FORCEINLINE_FUNCTION
   double threshold() {
-    return 10.0*epsilon();
+    return threshold<double>(); // non-template overload forwards to the double instantiation of the templated threshold()
   }
 
   /// Define constants
diff --git a/packages/intrepid2/src/Shared/Intrepid2_Utils.hpp b/packages/intrepid2/src/Shared/Intrepid2_Utils.hpp
index 14ad8483558a..45c5f09816d1 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_Utils.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_Utils.hpp
@@ -281,13 +281,13 @@ namespace Intrepid2 {
   template<typename T>
   KOKKOS_FORCEINLINE_FUNCTION
   constexpr typename
-  std::enable_if< !std::is_pod<T>::value, typename ScalarTraits<T>::scalar_type >::type
+  std::enable_if< !(std::is_standard_layout<T>::value && std::is_trivial<T>::value), typename ScalarTraits<T>::scalar_type >::type
   get_scalar_value(const T& obj) {return obj.val();}
 
   template<typename T>
   KOKKOS_FORCEINLINE_FUNCTION
   constexpr typename
-  std::enable_if< std::is_pod<T>::value, typename ScalarTraits<T>::scalar_type >::type
+  std::enable_if< std::is_standard_layout<T>::value && std::is_trivial<T>::value, typename ScalarTraits<T>::scalar_type >::type
   get_scalar_value(const T& obj){return obj;}
 
 
@@ -300,13 +300,13 @@ namespace Intrepid2 {
   template<typename T, typename ...P>
   KOKKOS_INLINE_FUNCTION
   constexpr typename
-  std::enable_if< std::is_pod<T>::value, unsigned >::type
+  std::enable_if< std::is_standard_layout<T>::value && std::is_trivial<T>::value, unsigned >::type
   dimension_scalar(const Kokkos::DynRankView<T, P...> /* view */) {return 1;}
 
   template<typename T, typename ...P>
   KOKKOS_INLINE_FUNCTION
   constexpr typename
-  std::enable_if< std::is_pod< typename Kokkos::View<T, P...>::value_type >::value, unsigned >::type
+  std::enable_if< std::is_standard_layout<typename Kokkos::View<T, P...>::value_type>::value && std::is_trivial<typename Kokkos::View<T, P...>::value_type>::value, unsigned >::type
   dimension_scalar(const Kokkos::View<T, P...> /*view*/) {return 1;}
 
   template<typename T, typename ...P>
@@ -339,7 +339,7 @@ namespace Intrepid2 {
     using DeviceType         = typename ViewType::device_type;
     using ViewTypeWithLayout = Kokkos::DynRankView<ValueType, ResultLayout, DeviceType >;
     
-    const bool allocateFadStorage = !std::is_pod<ValueType>::value;
+    const bool allocateFadStorage = !(std::is_standard_layout<ValueType>::value && std::is_trivial<ValueType>::value);
     if (!allocateFadStorage)
     {
       return ViewTypeWithLayout(label,dims...);
@@ -766,7 +766,7 @@ namespace Intrepid2 {
   template <typename ValueType>
   struct NaturalLayoutForType {
     using layout  =
-    typename std::conditional<std::is_pod<ValueType>::value,
+    typename std::conditional<(std::is_standard_layout<ValueType>::value && std::is_trivial<ValueType>::value),
       Kokkos::LayoutLeft, // for POD types, use LayoutLeft
       Kokkos::LayoutNatural<Kokkos::LayoutLeft> >::type; // For FAD types, use LayoutNatural
   };
@@ -791,7 +791,7 @@ namespace Intrepid2 {
   template<typename Scalar>
   constexpr int getVectorSizeForHierarchicalParallelism()
   {
-    return std::is_pod<Scalar>::value ? VECTOR_SIZE : FAD_VECTOR_SIZE;
+    return (std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value) ? VECTOR_SIZE : FAD_VECTOR_SIZE;
   }
   
   /**
@@ -803,7 +803,7 @@ namespace Intrepid2 {
   KOKKOS_INLINE_FUNCTION
   constexpr unsigned getScalarDimensionForView(const ViewType &view)
   {
-    return (std::is_pod<typename ViewType::value_type>::value) ? 0 : get_dimension_scalar(view);
+    return (std::is_standard_layout<typename ViewType::value_type>::value && std::is_trivial<typename ViewType::value_type>::value) ? 0 : get_dimension_scalar(view);
   }
 } // end namespace Intrepid2
 
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp
index 3bd8ce4c4aee..740cded482a6 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp
@@ -510,6 +510,35 @@ namespace
       
       printView(actualResultData.getUnderlyingView3(), out);
     }
+    
+    // now, check that u' A v = v' A' u for arbitrary vectors u,v
+    
+    // set up a second vector (v)
+    auto vector2View = getView<Scalar,DeviceType>("vector2", cellCount, pointCount, spaceDim);
+    auto vector2ViewHost = Kokkos::create_mirror(vector2View);
+    vector2ViewHost(0,0,0) =  3.0;
+    vector2ViewHost(0,0,1) =  2.0;
+    Kokkos::deep_copy(vector2View, vector2ViewHost);
+    
+    Data<Scalar,DeviceType> u_data(vectorView);
+    Data<Scalar,DeviceType> A_data(matrixView);
+    Data<Scalar,DeviceType> v_data(vector2View);
+    
+    auto AvResultData = Data<Scalar,DeviceType>::allocateMatVecResult(A_data, v_data, false);
+    AvResultData.storeMatVec(A_data, v_data, false);
+    
+    auto upAvResultData = Data<Scalar,DeviceType>::allocateDotProductResult(u_data, AvResultData);
+    upAvResultData.storeDotProduct(u_data, AvResultData);
+      
+    auto ApuResultData = Data<Scalar,DeviceType>::allocateMatVecResult(A_data, u_data, true);
+    ApuResultData.storeMatVec(A_data, u_data, true);
+    
+    auto vpAuResultData = Data<Scalar,DeviceType>::allocateDotProductResult(v_data, ApuResultData);
+    vpAuResultData.storeDotProduct(v_data, ApuResultData);
+    
+    testFloatingEquality2(upAvResultData, vpAuResultData, relTol, absTol, out, success);
+    printView(upAvResultData.getUnderlyingView2(), out);
+    printView(vpAuResultData.getUnderlyingView2(), out);
   }
 
 // #pragma mark Data: MatMat
@@ -576,6 +605,82 @@ namespace
     printView(actualResultData.getUnderlyingView3(), out);
   }
 
+/** \brief Exercises Data's matrix-matrix multiplication on an outer product of two vectors: a D x 1 matrix times a 1 x D matrix at each (C,P), compared against entrywise products computed on the host.
+*/
+  TEUCHOS_UNIT_TEST( Data, MatMatOuterProduct )
+  {
+    using Scalar = double;
+    using DeviceType = DefaultTestDeviceType;
+    
+    double relTol = 1e-13;
+    double absTol = 1e-13;
+    const int cellCount = 1;
+    const int pointCount = 1;
+    const int spaceDim = 2;
+    auto leftVectorView = getView<Scalar,DeviceType>("left vector", cellCount, pointCount, spaceDim);
+    auto leftHost = Kokkos::create_mirror(leftVectorView);
+    leftHost(0,0,0) = 1.0;
+    leftHost(0,0,1) = 0.5;
+    Kokkos::deep_copy(leftVectorView, leftHost);
+    
+    Data<Scalar,DeviceType> leftVector(leftVectorView);
+    
+    auto rightVectorView = getView<Scalar,DeviceType>("right vector", cellCount, pointCount, spaceDim);
+    auto rightHost = Kokkos::create_mirror(rightVectorView);
+    rightHost(0,0,0) = 0.5;
+    rightHost(0,0,1) = 1.0;
+    Kokkos::deep_copy(rightVectorView, rightHost);
+    Data<Scalar,DeviceType> rightVector(rightVectorView);
+    
+    // view leftVector as rank 4, (C,P,D,1): one D x 1 column matrix per (C,P)
+    const int matrixRank = 4;
+    auto leftExtents        = leftVector.getExtents();
+    auto leftVariationTypes = leftVector.getVariationTypes();
+    auto leftMatrix = leftVector.shallowCopy(matrixRank, leftExtents, leftVariationTypes);
+    
+    // view rightVector as rank 4, (C,P,1,D): one 1 x D row matrix per (C,P); the D entry moves from slot 2 to slot 3
+    auto rightExtents        = rightVector.getExtents();
+    auto rightVariationTypes = rightVector.getVariationTypes();
+    rightExtents[3]          = rightExtents[2];
+    rightExtents[2]          = 1;
+    rightVariationTypes[3]   = rightVariationTypes[2];
+    rightVariationTypes[2]   = CONSTANT;
+    auto rightMatrix  = rightVector.shallowCopy(matrixRank, rightExtents, rightVariationTypes);
+    
+    auto expectedResultView = getView<Scalar,DeviceType>("result matrix", cellCount, pointCount, spaceDim, spaceDim);
+    auto expectedHost = Kokkos::create_mirror(expectedResultView);
+    
+    const int cellOrdinal = 0;
+    for (int row=0; row<spaceDim; row++)
+    {
+      for (int col=0; col<spaceDim; col++)
+      {
+        // outer product entry: result(row,col) = left(row) * right(col)
+        const Scalar leftEntry  =  leftHost(cellOrdinal,0,row);
+        const Scalar rightEntry = rightHost(cellOrdinal,0,col);
+        expectedHost(cellOrdinal,0,row,col) = leftEntry * rightEntry;
+      }
+    }
+    Kokkos::deep_copy(expectedResultView, expectedHost);
+    
+    const bool transposeA = false;
+    const bool transposeB = false;
+    
+    auto actualResultData = Data<Scalar,DeviceType>::allocateMatMatResult(transposeA, leftMatrix, transposeB, rightMatrix);
+    
+    TEST_EQUALITY(         4, actualResultData.rank());
+    TEST_EQUALITY( cellCount, actualResultData.extent_int(0));
+    TEST_EQUALITY(pointCount, actualResultData.extent_int(1));
+    TEST_EQUALITY(  spaceDim, actualResultData.extent_int(2));
+    TEST_EQUALITY(  spaceDim, actualResultData.extent_int(3));
+    
+    actualResultData.storeMatMat(transposeA, leftMatrix, transposeB, rightMatrix);
+    
+    testFloatingEquality4(expectedResultView, actualResultData, relTol, absTol, out, success);
+    
+    printView(actualResultData.getUnderlyingView(), out);
+  }
+
 // #pragma mark Data: MatMatExplicitIdentity_PDD
 /** \brief Data provides matrix-matrix multiplication support.  This method checks correctness of the computed mat-mat for several cases involving 3x3 identity matrices.  Here, the logical dimensions (C,P,D,D) differ from the stored dimensions of (P,D,D).  We test each possible transpose combination.
 */
@@ -725,6 +830,48 @@ TEUCHOS_UNIT_TEST( Data, MatMatExplicitIdentity_PDD ) // (P,D,D) underlying; not
     
     printView(actualResultData.getUnderlyingView2(), out);
   }
+
+// #pragma mark Data: VecDotProduct
+/** \brief Data provides vector dot product support.  This method checks correctness of the computed dot product for a particular case involving two length-2 vectors on a single cell; the expected value is computed entrywise on the host.
+*/
+  TEUCHOS_UNIT_TEST( Data, VecDotProduct )
+  {
+    double relTol = 1e-13;
+    double absTol = 1e-13;
+    
+    using DeviceType = DefaultTestDeviceType;
+    using Scalar = double;
+    const int numCells = 1;
+    const int spaceDim = 2;
+    
+    auto vec1View = getView<Scalar,DeviceType>("vector", numCells, spaceDim);
+    auto vec1ViewHost = Kokkos::create_mirror(vec1View);
+    
+    vec1ViewHost(0,0) = 1.0;
+    vec1ViewHost(0,1) = 2.0;
+    Kokkos::deep_copy(vec1View, vec1ViewHost);
+    
+    auto vec2View = getView<Scalar,DeviceType>("vector", numCells, spaceDim);
+    auto vec2ViewHost = Kokkos::create_mirror(vec2View); // mirror the view that this host copy is deep_copy'd into below
+    
+    vec2ViewHost(0,0) = 3.0;
+    vec2ViewHost(0,1) = 2.0;
+    Kokkos::deep_copy(vec2View, vec2ViewHost);
+    
+    auto expectedResultView = getView<Scalar,DeviceType>("result",numCells);
+    auto expectedResultViewHost = Kokkos::create_mirror(expectedResultView);
+    
+    // expected dot product for the single cell, summed over the spaceDim entries
+    expectedResultViewHost(0) = vec1ViewHost(0,0) * vec2ViewHost(0,0) + vec1ViewHost(0,1) * vec2ViewHost(0,1);
+    Kokkos::deep_copy(expectedResultView, expectedResultViewHost);
+    
+    Data<Scalar,DeviceType> vec1Data(vec1View);
+    Data<Scalar,DeviceType> vec2Data(vec2View);
+    auto actualResultData = Data<Scalar,DeviceType>::allocateDotProductResult(vec1Data, vec2Data);
+    actualResultData.storeDotProduct(vec1Data, vec2Data);
+    
+    testFloatingEquality1(expectedResultView, actualResultData.getUnderlyingView1(), relTol, absTol, out, success);
+  }
   
   // test statically that Data supports all 7 rank operators
   static_assert(supports_rank<Data<double,DefaultTestDeviceType>,1>::value, "Data is expected to support up to rank 7");
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_GeneralStandardIntegration.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_GeneralStandardIntegration.cpp
index 257b700bab2f..fd5672916aad 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_GeneralStandardIntegration.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_GeneralStandardIntegration.cpp
@@ -83,6 +83,7 @@ void testStandardIntegration(int meshWidth, int polyOrder, int worksetSize,
   EFunctionSpace fs;
   EOperator op1, op2;
   int numOps = 0; // can be 1 or 2
+  Teuchos::RCP<Kokkos::Array<Scalar,spaceDim>> vectorWeight1, vectorWeight2;
   switch (formulation)
   {
     case Poisson:
@@ -113,12 +114,32 @@ void testStandardIntegration(int meshWidth, int polyOrder, int worksetSize,
       op1 = EOperator::OPERATOR_VALUE;
       fs = EFunctionSpace::FUNCTION_SPACE_HDIV;
       break;
+    case VectorWeightedPoisson:
+      numOps = 1;
+      op1 = EOperator::OPERATOR_GRAD;
+      fs = EFunctionSpace::FUNCTION_SPACE_HGRAD;
+      vectorWeight1 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+      vectorWeight2 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+      double weight = 1.0;
+      for (int d=0; d<spaceDim; d++)
+      {
+        (*vectorWeight1)[d] = weight;
+        weight /= 2.0;
+      }
+      
+      weight = 0.5;
+      for (int d=0; d<spaceDim; d++)
+      {
+        (*vectorWeight2)[d] = weight;
+        weight *= 2.0;
+      }
+      break;
   }
     
   double flopCountIntegration = 0, flopCountJacobian = 0;
   auto generalIntegrals = performStandardAssembly<Scalar,BasisFamily>(geometry, worksetSize,
-                                                                      polyOrder, fs, op1,
-                                                                      polyOrder, fs, op1,
+                                                                      polyOrder, fs, op1, vectorWeight1,
+                                                                      polyOrder, fs, op1, vectorWeight2,
                                                                       flopCountIntegration, flopCountJacobian);
   if (numOps == 2)
   {
@@ -136,7 +157,7 @@ void testStandardIntegration(int meshWidth, int polyOrder, int worksetSize,
     });
   }
   
-  auto specificIntegrals = performStandardQuadrature<Scalar, BasisFamily>(formulation, geometry, polyOrder, worksetSize, flopCountIntegration, flopCountJacobian);
+  auto specificIntegrals = performStandardQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, flopCountIntegration, flopCountJacobian, vectorWeight1, vectorWeight2);
     
   out << "Comparing new general standard assembly implementation to previous formulation-specific integration path…\n";
   testFloatingEquality3(generalIntegrals, specificIntegrals, relTol, absTol, out, success, "general integral", "specific formulation integral");
@@ -167,4 +188,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(StructuredIntegration, GeneralStandardInteg
 TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(StructuredIntegration, GeneralStandardIntegration, PoissonFormulation, D2, P3)
 TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(StructuredIntegration, GeneralStandardIntegration, PoissonFormulation, D3, P3)
 
+TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(StructuredIntegration, GeneralStandardIntegration, VectorWeightedPoissonFormulation, D1, P1)
+TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(StructuredIntegration, GeneralStandardIntegration, VectorWeightedPoissonFormulation, D2, P3)
+TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(StructuredIntegration, GeneralStandardIntegration, VectorWeightedPoissonFormulation, D3, P3)
+
 } // anonymous namespace
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_GeneralStructuredIntegration.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_GeneralStructuredIntegration.cpp
index 7e67f3b15579..7d12fe961809 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_GeneralStructuredIntegration.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_GeneralStructuredIntegration.cpp
@@ -83,6 +83,7 @@ void testStructuredIntegration(int meshWidth, int polyOrder, int worksetSize,
   EFunctionSpace fs;
   EOperator op1, op2;
   int numOps = 0; // can be 1 or 2
+  Teuchos::RCP<Kokkos::Array<Scalar,spaceDim>> vectorWeight1, vectorWeight2;
   switch (formulation)
   {
     case Poisson:
@@ -113,12 +114,32 @@ void testStructuredIntegration(int meshWidth, int polyOrder, int worksetSize,
       op1 = EOperator::OPERATOR_VALUE;
       fs = EFunctionSpace::FUNCTION_SPACE_HDIV;
       break;
+    case VectorWeightedPoisson:
+      numOps = 1;
+      op1 = EOperator::OPERATOR_GRAD;
+      fs = EFunctionSpace::FUNCTION_SPACE_HGRAD;
+      vectorWeight1 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+      vectorWeight2 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+      double weight = 1.0;
+      for (int d=0; d<spaceDim; d++)
+      {
+        (*vectorWeight1)[d] = weight;
+        weight /= 2.0;
+      }
+      
+      weight = 0.5;
+      for (int d=0; d<spaceDim; d++)
+      {
+        (*vectorWeight2)[d] = weight;
+        weight *= 2.0;
+      }
+      break;
   }
     
   double flopCountIntegration = 0, flopCountJacobian = 0;
   auto generalIntegrals = performStructuredAssembly<Scalar,BasisFamily>(geometry, worksetSize,
-                                                                        polyOrder, fs, op1,
-                                                                        polyOrder, fs, op1,
+                                                                        polyOrder, fs, op1, vectorWeight1,
+                                                                        polyOrder, fs, op1, vectorWeight2,
                                                                         flopCountIntegration, flopCountJacobian);
   if (numOps == 2)
   {
@@ -136,7 +157,7 @@ void testStructuredIntegration(int meshWidth, int polyOrder, int worksetSize,
     });
   }
   
-  auto specificIntegrals = performStructuredQuadrature<Scalar, BasisFamily>(formulation, geometry, polyOrder, worksetSize, flopCountIntegration, flopCountJacobian);
+  auto specificIntegrals = performStructuredQuadrature<Scalar, BasisFamily>(formulation, geometry, polyOrder, worksetSize, flopCountIntegration, flopCountJacobian, vectorWeight1, vectorWeight2);
     
   out << "Comparing new general standard assembly implementation to previous formulation-specific integration path…\n";
   testFloatingEquality3(generalIntegrals, specificIntegrals, relTol, absTol, out, success, "general integral", "specific formulation integral");
@@ -167,4 +188,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(StructuredIntegration, GeneralStructuredInt
 TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(StructuredIntegration, GeneralStructuredIntegration, PoissonFormulation, D2, P3)
 TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(StructuredIntegration, GeneralStructuredIntegration, PoissonFormulation, D3, P3)
 
+TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(StructuredIntegration, GeneralStructuredIntegration, VectorWeightedPoissonFormulation, D1, P1)
+TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(StructuredIntegration, GeneralStructuredIntegration, VectorWeightedPoissonFormulation, D2, P3)
+TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(StructuredIntegration, GeneralStructuredIntegration, VectorWeightedPoissonFormulation, D3, P3)
+
 } // anonymous namespace
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_QuadratureUniformMesh.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_QuadratureUniformMesh.cpp
index cad8b2a13534..ab1e182c0417 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_QuadratureUniformMesh.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_QuadratureUniformMesh.cpp
@@ -74,11 +74,31 @@ namespace
       gridCellCounts[d] = meshWidth;
     }
     
+    Teuchos::RCP<Kokkos::Array<Scalar,spaceDim>> vectorWeight1, vectorWeight2;
+    if (formulation == VectorWeightedPoisson)
+    {
+      vectorWeight1 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+      vectorWeight2 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+      double weight = 1.0;
+      for (int d=0; d<spaceDim; d++)
+      {
+        (*vectorWeight1)[d] = weight;
+        weight /= 2.0;
+      }
+      
+      weight = 0.5;
+      for (int d=0; d<spaceDim; d++)
+      {
+        (*vectorWeight2)[d] = weight;
+        weight *= 2.0;
+      }
+    }
+    
     auto geometry = getMesh<PointScalar, spaceDim, DeviceType>(algorithm, gridCellCounts);
     double flopCountIntegration = 0, flopCountJacobian = 0;
-    auto standardIntegrals = performStandardQuadrature<Scalar, BasisFamily>(formulation, geometry, polyOrder, worksetSize, flopCountIntegration, flopCountJacobian);
+    auto standardIntegrals = performStandardQuadrature<Scalar, BasisFamily>(formulation, geometry, polyOrder, worksetSize, flopCountIntegration, flopCountJacobian, vectorWeight1, vectorWeight2);
     
-    auto structuredIntegrals = performStructuredQuadrature<Scalar, BasisFamily>(formulation, geometry, polyOrder, worksetSize, flopCountIntegration, flopCountJacobian);
+    auto structuredIntegrals = performStructuredQuadrature<Scalar, BasisFamily>(formulation, geometry, polyOrder, worksetSize, flopCountIntegration, flopCountJacobian, vectorWeight1, vectorWeight2);
     
     out << "Comparing standard Intrepid2 integration to new integration path…\n";
     testFloatingEquality3(standardIntegrals, structuredIntegrals, relTol, absTol, out, success, "standard Intrepid2 integral", "structured integral");
@@ -108,170 +128,179 @@ namespace
 
   // comparisons are to Standard algorithm, so we don't instantiate with Standard:
   // 1D, p=1 tests:
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D1, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D1, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D1, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D1, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D1, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D1, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D1, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D1, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D1, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D1, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D1, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineTensorAlgorithm,    D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               NonAffineTensorAlgorithm, D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineNonTensorAlgorithm, D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               UniformAlgorithm,         D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineTensorAlgorithm,    D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 NonAffineTensorAlgorithm, D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineNonTensorAlgorithm, D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 UniformAlgorithm,         D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineTensorAlgorithm,    D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    NonAffineTensorAlgorithm, D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineNonTensorAlgorithm, D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    UniformAlgorithm,         D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, VectorWeightedPoissonFormulation, NonAffineTensorAlgorithm, D1, P1)
   // 1D, p=2 tests:
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D1, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D1, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D1, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D1, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D1, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D1, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D1, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D1, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D1, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D1, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D1, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineTensorAlgorithm,    D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               NonAffineTensorAlgorithm, D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineNonTensorAlgorithm, D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               UniformAlgorithm,         D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineTensorAlgorithm,    D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 NonAffineTensorAlgorithm, D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineNonTensorAlgorithm, D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 UniformAlgorithm,         D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineTensorAlgorithm,    D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    NonAffineTensorAlgorithm, D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineNonTensorAlgorithm, D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    UniformAlgorithm,         D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, VectorWeightedPoissonFormulation, NonAffineTensorAlgorithm, D1, P2)
   // 1D, p=4 tests:
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D1, P4)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D1, P4)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D1, P4)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D1, P4)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D1, P4)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D1, P4)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D1, P4)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D1, P4)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D1, P4)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D1, P4)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D1, P4)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineTensorAlgorithm,    D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               NonAffineTensorAlgorithm, D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineNonTensorAlgorithm, D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               UniformAlgorithm,         D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineTensorAlgorithm,    D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 NonAffineTensorAlgorithm, D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineNonTensorAlgorithm, D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 UniformAlgorithm,         D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineTensorAlgorithm,    D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    NonAffineTensorAlgorithm, D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineNonTensorAlgorithm, D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    UniformAlgorithm,         D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, VectorWeightedPoissonFormulation, NonAffineTensorAlgorithm, D1, P4)
 
   // 2D, p=1 tests:
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineTensorAlgorithm,    D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    NonAffineTensorAlgorithm, D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineNonTensorAlgorithm, D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    UniformAlgorithm,         D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineTensorAlgorithm,    D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   NonAffineTensorAlgorithm, D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineNonTensorAlgorithm, D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   UniformAlgorithm,         D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D2, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineTensorAlgorithm,    D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               NonAffineTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineNonTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               UniformAlgorithm,         D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineTensorAlgorithm,    D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 NonAffineTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineNonTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 UniformAlgorithm,         D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  AffineTensorAlgorithm,    D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  NonAffineTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  AffineNonTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  UniformAlgorithm,         D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 AffineTensorAlgorithm,    D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 NonAffineTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 AffineNonTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 UniformAlgorithm,         D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineTensorAlgorithm,    D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    NonAffineTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineNonTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    UniformAlgorithm,         D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, VectorWeightedPoissonFormulation, NonAffineTensorAlgorithm, D2, P1)
   // 2D, p=2 tests:
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineTensorAlgorithm,    D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    NonAffineTensorAlgorithm, D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineNonTensorAlgorithm, D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    UniformAlgorithm,         D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineTensorAlgorithm,    D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   NonAffineTensorAlgorithm, D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineNonTensorAlgorithm, D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   UniformAlgorithm,         D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D2, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineTensorAlgorithm,    D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               NonAffineTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineNonTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               UniformAlgorithm,         D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineTensorAlgorithm,    D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 NonAffineTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineNonTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 UniformAlgorithm,         D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  AffineTensorAlgorithm,    D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  NonAffineTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  AffineNonTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  UniformAlgorithm,         D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 AffineTensorAlgorithm,    D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 NonAffineTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 AffineNonTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 UniformAlgorithm,         D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineTensorAlgorithm,    D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    NonAffineTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineNonTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    UniformAlgorithm,         D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, VectorWeightedPoissonFormulation, NonAffineTensorAlgorithm, D2, P2)
   // 2D, p=3 tests:
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineTensorAlgorithm,    D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    NonAffineTensorAlgorithm, D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineNonTensorAlgorithm, D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    UniformAlgorithm,         D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineTensorAlgorithm,    D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   NonAffineTensorAlgorithm, D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineNonTensorAlgorithm, D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   UniformAlgorithm,         D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D2, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineTensorAlgorithm,    D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               NonAffineTensorAlgorithm, D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineNonTensorAlgorithm, D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               UniformAlgorithm,         D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineTensorAlgorithm,    D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 NonAffineTensorAlgorithm, D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineNonTensorAlgorithm, D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 UniformAlgorithm,         D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  AffineTensorAlgorithm,    D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  NonAffineTensorAlgorithm, D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  AffineNonTensorAlgorithm, D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  UniformAlgorithm,         D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 AffineTensorAlgorithm,    D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 NonAffineTensorAlgorithm, D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 AffineNonTensorAlgorithm, D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 UniformAlgorithm,         D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineTensorAlgorithm,    D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    NonAffineTensorAlgorithm, D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineNonTensorAlgorithm, D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    UniformAlgorithm,         D2, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, VectorWeightedPoissonFormulation, NonAffineTensorAlgorithm, D2, P3)
 
-  // 3D, p=1 tests:
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineTensorAlgorithm,    D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    NonAffineTensorAlgorithm, D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineNonTensorAlgorithm, D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    UniformAlgorithm,         D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineTensorAlgorithm,    D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   NonAffineTensorAlgorithm, D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineNonTensorAlgorithm, D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   UniformAlgorithm,         D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D3, P1)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D3, P1)
+  // 3D, p=1 tests:
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineTensorAlgorithm,    D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               NonAffineTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineNonTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               UniformAlgorithm,         D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineTensorAlgorithm,    D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 NonAffineTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineNonTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 UniformAlgorithm,         D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  AffineTensorAlgorithm,    D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  NonAffineTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  AffineNonTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  UniformAlgorithm,         D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 AffineTensorAlgorithm,    D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 NonAffineTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 AffineNonTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 UniformAlgorithm,         D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineTensorAlgorithm,    D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    NonAffineTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineNonTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    UniformAlgorithm,         D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, VectorWeightedPoissonFormulation, NonAffineTensorAlgorithm, D3, P1)
   // 3D, p=2 tests:
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineTensorAlgorithm,    D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    NonAffineTensorAlgorithm, D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineNonTensorAlgorithm, D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    UniformAlgorithm,         D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineTensorAlgorithm,    D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   NonAffineTensorAlgorithm, D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineNonTensorAlgorithm, D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   UniformAlgorithm,         D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D3, P2)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineTensorAlgorithm,    D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               NonAffineTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineNonTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               UniformAlgorithm,         D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineTensorAlgorithm,    D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 NonAffineTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineNonTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 UniformAlgorithm,         D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  AffineTensorAlgorithm,    D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  NonAffineTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  AffineNonTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  UniformAlgorithm,         D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 AffineTensorAlgorithm,    D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 NonAffineTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 AffineNonTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 UniformAlgorithm,         D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineTensorAlgorithm,    D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    NonAffineTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineNonTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    UniformAlgorithm,         D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, VectorWeightedPoissonFormulation, NonAffineTensorAlgorithm, D3, P2)
   // 3D, p=3 tests:
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineTensorAlgorithm,    D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    NonAffineTensorAlgorithm, D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineNonTensorAlgorithm, D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    UniformAlgorithm,         D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineTensorAlgorithm,    D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   NonAffineTensorAlgorithm, D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineNonTensorAlgorithm, D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   UniformAlgorithm,         D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D3, P3)
-  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineTensorAlgorithm,    D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               NonAffineTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               AffineNonTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation,               UniformAlgorithm,         D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineTensorAlgorithm,    D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 NonAffineTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 AffineNonTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,                 UniformAlgorithm,         D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  AffineTensorAlgorithm,    D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  NonAffineTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  AffineNonTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,                  UniformAlgorithm,         D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 AffineTensorAlgorithm,    D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 NonAffineTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 AffineNonTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,                 UniformAlgorithm,         D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineTensorAlgorithm,    D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    NonAffineTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    AffineNonTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,                    UniformAlgorithm,         D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, VectorWeightedPoissonFormulation, NonAffineTensorAlgorithm, D3, P3)
 } // anonymous namespace
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_StructuredVersusStandard.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_StructuredVersusStandard.cpp
index 28ffcb37b3bd..27059b43e728 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_StructuredVersusStandard.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_StructuredVersusStandard.cpp
@@ -64,8 +64,8 @@ namespace
 
 template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
 void testStandardVersusStructuredIntegration(const int &meshWidth, const int &worksetSize,
-                                             const EFunctionSpace &fs1, const EOperator &op1, const int &p1,
-                                             const EFunctionSpace &fs2, const EOperator &op2, const int &p2,
+                                             const EFunctionSpace &fs1, const EOperator &op1, const int &p1, Teuchos::RCP< Kokkos::Array<PointScalar,spaceDim> > vectorWeight1,
+                                             const EFunctionSpace &fs2, const EOperator &op2, const int &p2, Teuchos::RCP< Kokkos::Array<PointScalar,spaceDim> > vectorWeight2,
                                              const double &relTol, const double &absTol,
                                              Teuchos::FancyOStream &out, bool &success)
 {
@@ -84,19 +84,32 @@ void testStandardVersusStructuredIntegration(const int &meshWidth, const int &wo
   
   double flopCountIntegration = 0, flopCountJacobian = 0;
   auto structuredIntegrals = performStructuredAssembly<Scalar,BasisFamily>(geometry, worksetSize,
-                                                                           p1, fs1, op1,
-                                                                           p2, fs2, op2,
+                                                                           p1, fs1, op1, vectorWeight1,
+                                                                           p2, fs2, op2, vectorWeight2,
                                                                            flopCountIntegration, flopCountJacobian);
   
   auto standardIntegrals = performStandardAssembly<Scalar,BasisFamily>(geometry, worksetSize,
-                                                                       p1, fs1, op1,
-                                                                       p2, fs2, op2,
+                                                                       p1, fs1, op1, vectorWeight1,
+                                                                       p2, fs2, op2, vectorWeight2,
                                                                        flopCountIntegration, flopCountJacobian);
     
   out << "Comparing general standard assembly to structured integration path…\n";
   testFloatingEquality3(standardIntegrals, structuredIntegrals, relTol, absTol, out, success, "standard integral", "structured formulation integral");
 }
 
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+void testStandardVersusStructuredIntegration(const int &meshWidth, const int &worksetSize,
+                                             const EFunctionSpace &fs1, const EOperator &op1, const int &p1,
+                                             const EFunctionSpace &fs2, const EOperator &op2, const int &p2,
+                                             const double &relTol, const double &absTol,
+                                             Teuchos::FancyOStream &out, bool &success)
+{ // Convenience overload without vector weights: forwards every argument to the vector-weighted overload, passing Teuchos::null for both weights.
+  testStandardVersusStructuredIntegration<Scalar, BasisFamily, PointScalar, spaceDim, DeviceType>(meshWidth, worksetSize,
+                                                                                                  fs1, op1, p1, Teuchos::null, // null RCP: no vector weight for operand 1
+                                                                                                  fs2, op2, p2, Teuchos::null, // null RCP: no vector weight for operand 2
+                                                                                                  relTol, absTol, out, success);
+}
+
 TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, StructuredVersusStandard_D1_P1_P1, FS1Tag, Op1Tag, FS2Tag, Op2Tag)
 {
   using DataScalar  = double;
@@ -322,6 +335,381 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, StructuredVersusStandar
     (meshWidth, worksetSize, fs1, op1, p1, fs2, op2, p2, relTol, absTol, out, success);
 }
 
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, StructuredVersusStandardVectorWeighted_D1_P1_P1, FS1Tag, Op1Tag, FS2Tag, Op2Tag)
+{
+  using DataScalar  = double;
+  using PointScalar = double;
+  const int meshWidth = 1;
+  const int spaceDim = 1;
+  const int p1 = 1;
+  const int p2 = 1;
+  const int worksetSize = meshWidth;
+  // 1D, p1 = p2 = 1: structured vs. standard integration with a separate vector weight on each operand.
+  auto vectorWeight1 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  auto vectorWeight2 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  // vectorWeight1 starts at 1 and halves per dimension; with spaceDim = 1 it is just (1).
+  double weight = 1.0;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight1)[d] = weight;
+    weight /= 2.0;
+  }
+  // vectorWeight2 starts at 1/2 and doubles per dimension; with spaceDim = 1 it is just (1/2).
+  weight = 0.5;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight2)[d] = weight;
+    weight *= 2.0;
+  }
+
+  using DeviceType = DefaultTestDeviceType;
+  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+  // Function spaces and operators for both operands come from the template tag arguments.
+  const EFunctionSpace fs1 = FS1Tag::functionSpace;
+  const EFunctionSpace fs2 = FS2Tag::functionSpace;
+  const EOperator op1 = Op1Tag::op;
+  const EOperator op2 = Op2Tag::op;
+  // Relative and absolute tolerances passed on to the comparison helper.
+  double relTol = 1e-12;
+  double absTol = 1e-12;
+  // Delegate to the shared helper (vector-weighted overload).
+  testStandardVersusStructuredIntegration<DataScalar, BasisFamily, PointScalar, spaceDim, DeviceType>
+    (meshWidth, worksetSize, fs1, op1, p1, vectorWeight1, fs2, op2, p2, vectorWeight2, relTol, absTol, out, success);
+}
+
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, StructuredVersusStandardVectorWeighted_D2_P1_P1, FS1Tag, Op1Tag, FS2Tag, Op2Tag)
+{
+  using DataScalar  = double;
+  using PointScalar = double;
+  const int meshWidth = 1;
+  const int spaceDim = 2;
+  const int p1 = 1;
+  const int p2 = 1;
+  const int worksetSize = meshWidth;
+  // 2D, p1 = p2 = 1: structured vs. standard integration with a separate vector weight on each operand.
+  auto vectorWeight1 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  auto vectorWeight2 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  // vectorWeight1 starts at 1 and halves per dimension: (1, 1/2).
+  double weight = 1.0;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight1)[d] = weight;
+    weight /= 2.0;
+  }
+  // vectorWeight2 starts at 1/2 and doubles per dimension: (1/2, 1).
+  weight = 0.5;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight2)[d] = weight;
+    weight *= 2.0;
+  }
+
+  using DeviceType = DefaultTestDeviceType;
+  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+  // Function spaces and operators for both operands come from the template tag arguments.
+  const EFunctionSpace fs1 = FS1Tag::functionSpace;
+  const EFunctionSpace fs2 = FS2Tag::functionSpace;
+  const EOperator op1 = Op1Tag::op;
+  const EOperator op2 = Op2Tag::op;
+  // Relative and absolute tolerances passed on to the comparison helper.
+  double relTol = 1e-12;
+  double absTol = 1e-12;
+  // Delegate to the shared helper (vector-weighted overload).
+  testStandardVersusStructuredIntegration<DataScalar, BasisFamily, PointScalar, spaceDim, DeviceType>
+    (meshWidth, worksetSize, fs1, op1, p1, vectorWeight1, fs2, op2, p2, vectorWeight2, relTol, absTol, out, success);
+}
+
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, StructuredVersusStandardVectorWeighted_D2_P2_P1, FS1Tag, Op1Tag, FS2Tag, Op2Tag)
+{
+  using DataScalar  = double;
+  using PointScalar = double;
+  const int meshWidth = 1;
+  const int spaceDim = 2;
+  const int p1 = 2;
+  const int p2 = 1;
+  const int worksetSize = meshWidth;
+  // 2D, mixed orders (p1 = 2, p2 = 1): structured vs. standard integration with a separate vector weight on each operand.
+  auto vectorWeight1 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  auto vectorWeight2 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  // vectorWeight1 starts at 1 and halves per dimension: (1, 1/2).
+  double weight = 1.0;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight1)[d] = weight;
+    weight /= 2.0;
+  }
+  // vectorWeight2 starts at 1/2 and doubles per dimension: (1/2, 1).
+  weight = 0.5;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight2)[d] = weight;
+    weight *= 2.0;
+  }
+
+  using DeviceType = DefaultTestDeviceType;
+  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+  // Function spaces and operators for both operands come from the template tag arguments.
+  const EFunctionSpace fs1 = FS1Tag::functionSpace;
+  const EFunctionSpace fs2 = FS2Tag::functionSpace;
+  const EOperator op1 = Op1Tag::op;
+  const EOperator op2 = Op2Tag::op;
+  // Relative and absolute tolerances passed on to the comparison helper.
+  double relTol = 1e-12;
+  double absTol = 1e-12;
+  // Delegate to the shared helper (vector-weighted overload).
+  testStandardVersusStructuredIntegration<DataScalar, BasisFamily, PointScalar, spaceDim, DeviceType>
+    (meshWidth, worksetSize, fs1, op1, p1, vectorWeight1, fs2, op2, p2, vectorWeight2, relTol, absTol, out, success);
+}
+
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, StructuredVersusStandardScalarAgainstVectorDotVector_D1_P1_P1, FS1Tag, Op1Tag, FS2Tag, Op2Tag)
+{
+  using DataScalar  = double;
+  using PointScalar = double;
+  const int meshWidth = 1;
+  const int spaceDim = 1;
+  const int p1 = 1;
+  const int p2 = 1;
+  const int worksetSize = meshWidth;
+  // 1D, p1 = p2 = 1: operand 1 is unweighted (default-constructed, i.e. null, RCP); only operand 2 gets a vector weight.
+  Teuchos::RCP<Kokkos::Array<double,spaceDim> > vectorWeight1; // no vector weight on scalar term
+  auto vectorWeight2 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  // vectorWeight2 starts at 1 and halves per dimension; with spaceDim = 1 it is just (1).
+  double weight = 1.0;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight2)[d] = weight;
+    weight /= 2.0;
+  }
+
+  using DeviceType = DefaultTestDeviceType;
+  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+  // Function spaces and operators for both operands come from the template tag arguments.
+  const EFunctionSpace fs1 = FS1Tag::functionSpace;
+  const EFunctionSpace fs2 = FS2Tag::functionSpace;
+  const EOperator op1 = Op1Tag::op;
+  const EOperator op2 = Op2Tag::op;
+  // Relative and absolute tolerances passed on to the comparison helper.
+  double relTol = 1e-12;
+  double absTol = 1e-12;
+  // Delegate to the shared helper (vector-weighted overload); vectorWeight1 is passed as a null RCP.
+  testStandardVersusStructuredIntegration<DataScalar, BasisFamily, PointScalar, spaceDim, DeviceType>
+    (meshWidth, worksetSize, fs1, op1, p1, vectorWeight1, fs2, op2, p2, vectorWeight2, relTol, absTol, out, success);
+}
+
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, StructuredVersusStandardScalarAgainstVectorDotVector_D2_P1_P1, FS1Tag, Op1Tag, FS2Tag, Op2Tag)
+{
+  using DataScalar  = double;
+  using PointScalar = double;
+  const int meshWidth = 1;
+  const int spaceDim = 2;
+  const int p1 = 1;
+  const int p2 = 1;
+  const int worksetSize = meshWidth;
+  // 2D, p1 = p2 = 1: operand 1 is unweighted (default-constructed, i.e. null, RCP); only operand 2 gets a vector weight.
+  Teuchos::RCP<Kokkos::Array<double,spaceDim> > vectorWeight1; // no vector weight on scalar term
+  auto vectorWeight2 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  // vectorWeight2 starts at 1 and halves per dimension: (1, 1/2).
+  double weight = 1.0;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight2)[d] = weight;
+    weight /= 2.0;
+  }
+
+  using DeviceType = DefaultTestDeviceType;
+  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+  // Function spaces and operators for both operands come from the template tag arguments.
+  const EFunctionSpace fs1 = FS1Tag::functionSpace;
+  const EFunctionSpace fs2 = FS2Tag::functionSpace;
+  const EOperator op1 = Op1Tag::op;
+  const EOperator op2 = Op2Tag::op;
+  // Relative and absolute tolerances passed on to the comparison helper.
+  double relTol = 1e-12;
+  double absTol = 1e-12;
+  // Delegate to the shared helper (vector-weighted overload); vectorWeight1 is passed as a null RCP.
+  testStandardVersusStructuredIntegration<DataScalar, BasisFamily, PointScalar, spaceDim, DeviceType>
+    (meshWidth, worksetSize, fs1, op1, p1, vectorWeight1, fs2, op2, p2, vectorWeight2, relTol, absTol, out, success);
+}
+
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, StructuredVersusStandardScalarAgainstVectorDotVector_D3_P1_P1, FS1Tag, Op1Tag, FS2Tag, Op2Tag)
+{
+  using DataScalar  = double;
+  using PointScalar = double;
+  const int meshWidth = 1;
+  const int spaceDim = 3;
+  const int p1 = 1;
+  const int p2 = 1;
+  const int worksetSize = meshWidth;
+  // 3D, p1 = p2 = 1: operand 1 is unweighted (default-constructed, i.e. null, RCP); only operand 2 gets a vector weight.
+  Teuchos::RCP<Kokkos::Array<double,spaceDim> > vectorWeight1; // no vector weight on scalar term
+  auto vectorWeight2 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  // vectorWeight2 starts at 1 and halves per dimension: (1, 1/2, 1/4).
+  double weight = 1.0;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight2)[d] = weight;
+    weight /= 2.0;
+  }
+
+  using DeviceType = DefaultTestDeviceType;
+  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+  // Function spaces and operators for both operands come from the template tag arguments.
+  const EFunctionSpace fs1 = FS1Tag::functionSpace;
+  const EFunctionSpace fs2 = FS2Tag::functionSpace;
+  const EOperator op1 = Op1Tag::op;
+  const EOperator op2 = Op2Tag::op;
+  // Relative and absolute tolerances passed on to the comparison helper.
+  double relTol = 1e-12;
+  double absTol = 1e-12;
+  // Delegate to the shared helper (vector-weighted overload); vectorWeight1 is passed as a null RCP.
+  testStandardVersusStructuredIntegration<DataScalar, BasisFamily, PointScalar, spaceDim, DeviceType>
+    (meshWidth, worksetSize, fs1, op1, p1, vectorWeight1, fs2, op2, p2, vectorWeight2, relTol, absTol, out, success);
+}
+
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, StructuredVersusStandardVectorDotVectorAgainstScalar_D1_P1_P1, FS1Tag, Op1Tag, FS2Tag, Op2Tag)
+{
+  using DataScalar  = double;
+  using PointScalar = double;
+  const int meshWidth = 1;
+  const int spaceDim = 1;
+  const int p1 = 1;
+  const int p2 = 1;
+  const int worksetSize = meshWidth;
+  // 1D, p1 = p2 = 1: only operand 1 gets a vector weight (halving from 1; here just (1)); operand 2 is left as a null RCP.
+  auto vectorWeight1 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  double weight = 1.0;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight1)[d] = weight;
+    weight /= 2.0;
+  }
+  Teuchos::RCP<Kokkos::Array<double,spaceDim> > vectorWeight2; // no vector weight on scalar term
+  // Device and basis family used for the comparison.
+  using DeviceType = DefaultTestDeviceType;
+  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+  // Function spaces and operators for both operands come from the template tag arguments.
+  const EFunctionSpace fs1 = FS1Tag::functionSpace;
+  const EFunctionSpace fs2 = FS2Tag::functionSpace;
+  const EOperator op1 = Op1Tag::op;
+  const EOperator op2 = Op2Tag::op;
+  // Relative and absolute tolerances passed on to the comparison helper.
+  double relTol = 1e-12;
+  double absTol = 1e-12;
+  // Delegate to the shared helper (vector-weighted overload); vectorWeight2 is passed as a null RCP.
+  testStandardVersusStructuredIntegration<DataScalar, BasisFamily, PointScalar, spaceDim, DeviceType>
+    (meshWidth, worksetSize, fs1, op1, p1, vectorWeight1, fs2, op2, p2, vectorWeight2, relTol, absTol, out, success);
+}
+
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, StructuredVersusStandardVectorDotVectorAgainstScalar_D2_P1_P1, FS1Tag, Op1Tag, FS2Tag, Op2Tag)
+{
+  using DataScalar  = double;
+  using PointScalar = double;
+  const int meshWidth = 1;
+  const int spaceDim = 2;
+  const int p1 = 1;
+  const int p2 = 1;
+  const int worksetSize = meshWidth;
+  // 2D, p1 = p2 = 1: only operand 1 gets a vector weight, (1, 1/2); operand 2 is left as a null RCP.
+  auto vectorWeight1 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  double weight = 1.0;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight1)[d] = weight;
+    weight /= 2.0;
+  }
+  Teuchos::RCP<Kokkos::Array<double,spaceDim> > vectorWeight2; // no vector weight on scalar term
+  // Device and basis family used for the comparison.
+  using DeviceType = DefaultTestDeviceType;
+  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+  // Function spaces and operators for both operands come from the template tag arguments.
+  const EFunctionSpace fs1 = FS1Tag::functionSpace;
+  const EFunctionSpace fs2 = FS2Tag::functionSpace;
+  const EOperator op1 = Op1Tag::op;
+  const EOperator op2 = Op2Tag::op;
+  // Relative and absolute tolerances passed on to the comparison helper.
+  double relTol = 1e-12;
+  double absTol = 1e-12;
+  // Delegate to the shared helper (vector-weighted overload); vectorWeight2 is passed as a null RCP.
+  testStandardVersusStructuredIntegration<DataScalar, BasisFamily, PointScalar, spaceDim, DeviceType>
+    (meshWidth, worksetSize, fs1, op1, p1, vectorWeight1, fs2, op2, p2, vectorWeight2, relTol, absTol, out, success);
+}
+
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, StructuredVersusStandardVectorDotVectorAgainstScalar_D3_P1_P1, FS1Tag, Op1Tag, FS2Tag, Op2Tag)
+{
+  using DataScalar  = double;
+  using PointScalar = double;
+  const int meshWidth = 1;
+  const int spaceDim = 3;
+  const int p1 = 1;
+  const int p2 = 1;
+  const int worksetSize = meshWidth;
+  // 3D, p1 = p2 = 1: only operand 1 gets a vector weight, (1, 1/2, 1/4); operand 2 is left as a null RCP.
+  auto vectorWeight1 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  double weight = 1.0;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight1)[d] = weight;
+    weight /= 2.0;
+  }
+  Teuchos::RCP<Kokkos::Array<double,spaceDim> > vectorWeight2; // no vector weight on scalar term
+  // Device and basis family used for the comparison.
+  using DeviceType = DefaultTestDeviceType;
+  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+  // Function spaces and operators for both operands come from the template tag arguments.
+  const EFunctionSpace fs1 = FS1Tag::functionSpace;
+  const EFunctionSpace fs2 = FS2Tag::functionSpace;
+  const EOperator op1 = Op1Tag::op;
+  const EOperator op2 = Op2Tag::op;
+  // Relative and absolute tolerances passed on to the comparison helper.
+  double relTol = 1e-12;
+  double absTol = 1e-12;
+  // Delegate to the shared helper (vector-weighted overload); vectorWeight2 is passed as a null RCP.
+  testStandardVersusStructuredIntegration<DataScalar, BasisFamily, PointScalar, spaceDim, DeviceType>
+    (meshWidth, worksetSize, fs1, op1, p1, vectorWeight1, fs2, op2, p2, vectorWeight2, relTol, absTol, out, success);
+}
+
+TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, StructuredVersusStandardVectorWeighted_D3_P2_P1, FS1Tag, Op1Tag, FS2Tag, Op2Tag)
+{
+  using DataScalar  = double;
+  using PointScalar = double;
+  const int meshWidth = 1;
+  const int spaceDim = 3;
+  const int p1 = 2;
+  const int p2 = 1;
+  const int worksetSize = meshWidth;
+  // 3D, mixed orders (p1 = 2, p2 = 1): structured vs. standard integration with a separate vector weight on each operand.
+  auto vectorWeight1 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  auto vectorWeight2 = Teuchos::rcp(new Kokkos::Array<double,spaceDim>);
+  // vectorWeight1 starts at 1 and halves per dimension: (1, 1/2, 1/4).
+  double weight = 1.0;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight1)[d] = weight;
+    weight /= 2.0;
+  }
+  // vectorWeight2 starts at 1/2 and doubles per dimension: (1/2, 1, 2).
+  weight = 0.5;
+  for (int d=0; d<spaceDim; d++)
+  {
+    (*vectorWeight2)[d] = weight;
+    weight *= 2.0;
+  }
+
+  using DeviceType = DefaultTestDeviceType;
+  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+  // Function spaces and operators for both operands come from the template tag arguments.
+  const EFunctionSpace fs1 = FS1Tag::functionSpace;
+  const EFunctionSpace fs2 = FS2Tag::functionSpace;
+  const EOperator op1 = Op1Tag::op;
+  const EOperator op2 = Op2Tag::op;
+  // Relative and absolute tolerances passed on to the comparison helper.
+  double relTol = 1e-12;
+  double absTol = 1e-12;
+  // Delegate to the shared helper (vector-weighted overload).
+  testStandardVersusStructuredIntegration<DataScalar, BasisFamily, PointScalar, spaceDim, DeviceType>
+    (meshWidth, worksetSize, fs1, op1, p1, vectorWeight1, fs2, op2, p2, vectorWeight2, relTol, absTol, out, success);
+}
+
 // asymmetric tests (mostly -- a couple symmetric ones tossed in as sanity checks on the test itself)
 
 // 1D tests: H(grad) and H(vol) bases defined
@@ -338,6 +726,17 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStan
 TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandard_D1_P2_P1, HGRAD, VALUE, HGRAD, VALUE)
 TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandard_D1_P2_P1, HVOL,  VALUE, HGRAD, VALUE)
 
+// 1D vector-weighted test
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorWeighted_D1_P1_P1, HGRAD, GRAD, HGRAD, GRAD)
+
+// 1D scalar against vector-weighted tests
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardScalarAgainstVectorDotVector_D1_P1_P1, HVOL, VALUE, HGRAD, GRAD)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardScalarAgainstVectorDotVector_D1_P1_P1, HGRAD, VALUE, HGRAD, GRAD)
+
+// 1D vector-weighted against scalar tests
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorDotVectorAgainstScalar_D1_P1_P1, HGRAD, GRAD, HVOL,  VALUE)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorDotVectorAgainstScalar_D1_P1_P1, HGRAD, GRAD, HGRAD, VALUE)
+
 // 2D tests: curls of H(curl) are scalars.
 // p1, p1:
 TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandard_D2_P1_P1, HGRAD, GRAD,  HGRAD, GRAD)
@@ -367,6 +766,22 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStan
 TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandard_D2_P1_P2, HCURL, CURL,  HVOL,  VALUE)
 TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandard_D2_P1_P2, HVOL,  VALUE, HGRAD, VALUE)
 
+// 2D vector-weighted tests
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorWeighted_D2_P1_P1, HGRAD, GRAD, HGRAD, GRAD)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorWeighted_D2_P2_P1, HGRAD, GRAD, HGRAD, GRAD)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorWeighted_D2_P1_P1, HCURL, VALUE, HDIV,  VALUE)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorWeighted_D2_P2_P1, HCURL, VALUE, HDIV,  VALUE)
+
+// 2D scalar against vector-weighted tests
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardScalarAgainstVectorDotVector_D2_P1_P1, HVOL, VALUE, HGRAD, GRAD)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardScalarAgainstVectorDotVector_D2_P1_P1, HGRAD, VALUE, HGRAD, GRAD)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardScalarAgainstVectorDotVector_D2_P1_P1, HGRAD, VALUE, HDIV, VALUE)
+
+// 2D vector-weighted against scalar tests
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorDotVectorAgainstScalar_D2_P1_P1, HGRAD, GRAD, HVOL,  VALUE)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorDotVectorAgainstScalar_D2_P1_P1, HGRAD, GRAD, HGRAD, VALUE)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorDotVectorAgainstScalar_D2_P1_P1, HDIV, VALUE, HGRAD, VALUE)
+
 // 3D tests: curls of H(curl) are vectors
 // p1, p1:
 TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandard_D3_P1_P1, HGRAD, GRAD,  HGRAD, GRAD)
@@ -396,5 +811,19 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStan
 TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandard_D3_P1_P2, HCURL, CURL,  HDIV,  VALUE)
 TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandard_D3_P1_P2, HVOL,  VALUE, HGRAD, VALUE)
 
+// 3D vector-weighted tests
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorWeighted_D3_P2_P1, HGRAD, GRAD,  HGRAD, GRAD)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorWeighted_D3_P2_P1, HCURL, VALUE, HDIV,  VALUE)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorWeighted_D3_P2_P1, HCURL, CURL,  HGRAD, GRAD)
+
+// 3D scalar against vector-weighted tests
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardScalarAgainstVectorDotVector_D3_P1_P1, HVOL, VALUE, HGRAD, GRAD)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardScalarAgainstVectorDotVector_D3_P1_P1, HGRAD, VALUE, HGRAD, GRAD)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardScalarAgainstVectorDotVector_D3_P1_P1, HGRAD, VALUE, HDIV, VALUE)
+
+// 3D vector-weighted against scalar tests
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorDotVectorAgainstScalar_D3_P1_P1, HGRAD, GRAD, HVOL,  VALUE)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorDotVectorAgainstScalar_D3_P1_P1, HGRAD, GRAD, HGRAD, VALUE)
+TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, StructuredVersusStandardVectorDotVectorAgainstScalar_D3_P1_P1, HDIV, VALUE, HGRAD, VALUE)
 
 } // anonymous namespace
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_TagDefs.hpp b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_TagDefs.hpp
index bb1ce87fd872..fbafa35407d4 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_TagDefs.hpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_TagDefs.hpp
@@ -23,7 +23,8 @@ enum FormulationChoice
   Hgrad,   // (grad, grad) + (value, value)
   Hdiv,    // (div, div)   + (value, value)
   Hcurl,   // (curl, curl) + (value, value)
-  L2       // (value, value)
+  L2,      // (value, value)
+  VectorWeightedPoisson // (a dot grad, b dot grad)
 };
 
 enum AlgorithmChoice
@@ -64,6 +65,10 @@ class L2Formulation {
 public:
   static const FormulationChoice formulation = L2;
 };
+class VectorWeightedPoissonFormulation { // tag type: exposes the VectorWeightedPoisson enumerator as a static constant
+public:
+  static const FormulationChoice formulation = VectorWeightedPoisson;
+};
 class StandardAlgorithm
 {
 public:
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_Utils.hpp b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_Utils.hpp
index 799fa0135efe..513de612af3d 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_Utils.hpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests_Utils.hpp
@@ -26,6 +26,8 @@
 #include "HCURLStructuredAssembly.hpp"
 #include "HVOLStandardAssembly.hpp"
 #include "HVOLStructuredAssembly.hpp"
+#include "VectorWeightedGRADGRADStandardAssembly.hpp"
+#include "VectorWeightedGRADGRADStructuredAssembly.hpp"
 
 template< typename PointScalar, int spaceDim, typename DeviceType >
 inline
@@ -65,10 +67,12 @@ CellGeometry<PointScalar, spaceDim, DeviceType> getMesh(AlgorithmChoice algorith
   return uniformTensorGeometry; // this line should be unreachable; included to avoid compiler warnings from nvcc
 }
 
-template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType, unsigned long spaceDim2 = spaceDim>  // spaceDim and spaceDim2 should agree on value (differ on type)
 Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadrature(FormulationChoice formulation,
-                                        Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
-                                        double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+                                                                   Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                                                   double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount,
+                                                                   Teuchos::RCP< Kokkos::Array<PointScalar,spaceDim2> > vectorWeight1 = Teuchos::null,
+                                                                   Teuchos::RCP< Kokkos::Array<PointScalar,spaceDim2> > vectorWeight2 = Teuchos::null)
 {
   switch (formulation)
   {
@@ -82,15 +86,19 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadrature(FormulationCh
       return performStandardQuadratureHCURL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
     case L2:
       return performStandardQuadratureHVOL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    case VectorWeightedPoisson:
+      return performStandardQuadratureVectorWeightedGRADGRAD<Scalar,BasisFamily>(geometry, polyOrder, worksetSize, vectorWeight1, vectorWeight2, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
     default:
       INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported formulation");
   }
 }
 
-template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType, unsigned long spaceDim2 = spaceDim> // spaceDim and spaceDim2 should agree on value (differ on type)
 Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadrature(FormulationChoice formulation,
-                                          Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
-                                          double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+                                                                     Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                                                     double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount,
+                                                                     Teuchos::RCP< Kokkos::Array<PointScalar,spaceDim2> > vectorWeight1 = Teuchos::null,
+                                                                     Teuchos::RCP< Kokkos::Array<PointScalar,spaceDim2> > vectorWeight2 = Teuchos::null)
 {
   switch (formulation)
   {
@@ -104,6 +112,8 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadrature(Formulation
       return performStructuredQuadratureHCURL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
     case L2:
       return performStructuredQuadratureHVOL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    case VectorWeightedPoisson:
+      return performStructuredQuadratureVectorWeightedGRADGRAD<Scalar,BasisFamily>(geometry, polyOrder, worksetSize, vectorWeight1, vectorWeight2, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
     default:
       INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported formulation");
   }
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/TransformedBasisValuesTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/TransformedBasisValuesTests.cpp
index d9c388910cfb..55772e28a89d 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/TransformedBasisValuesTests.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/TransformedBasisValuesTests.cpp
@@ -8,7 +8,7 @@
 // @HEADER
 
 
-/** \file   TransformedVectorDataTests.cpp
+/** \file   TransformedBasisValuesTests.cpp
     \brief  Tests against TransformedBasisValues.
     \author Created by Nate Roberts
 */
@@ -341,6 +341,195 @@ namespace
     testFloatingEquality4(transformedGradValues, transformedGradientData, relTol, absTol, out, success);
   }
 
+  // testWeightedVectorTransformation checks TransformedBasisValues built from (C,P,D) vector transforms (transposed inverse Jacobian applied to the weight vectors a_u, a_v) against explicitly expanded "a dot grad" values.
+  template<int spaceDim>
+  void testWeightedVectorTransformation(const int &polyOrder, const int &meshWidth, Teuchos::FancyOStream &out, bool &success)
+  {
+    using DeviceType = DefaultTestDeviceType;
+    using Scalar = double;
+    using PointScalar = double;
+    
+    const double relTol = 1e-12;
+    const double absTol = 1e-12;
+    
+    auto fs = Intrepid2::FUNCTION_SPACE_HGRAD;
+    
+    auto lineBasis = Intrepid2::getLineBasis< Intrepid2::NodalBasisFamily<DeviceType> >(fs, polyOrder);
+    
+    int numFields_1D = lineBasis->getCardinality();
+    // after the loop below: numHypercubes = meshWidth^spaceDim and numFields = numFields_1D^spaceDim
+    int numFields = 1;
+    int numHypercubes = 1;
+    for (int d=0; d<spaceDim; d++)
+    {
+      numHypercubes *= meshWidth;
+      numFields     *= numFields_1D;
+    }
+    int numCells = numHypercubes;
+      
+    shards::CellTopology lineTopo = shards::getCellTopologyData< shards::Line<> >();
+    shards::CellTopology cellTopo;
+    if      (spaceDim == 1) cellTopo = shards::getCellTopologyData< shards::Line<>          >();
+    else if (spaceDim == 2) cellTopo = shards::getCellTopologyData< shards::Quadrilateral<> >();
+    else if (spaceDim == 3) cellTopo = shards::getCellTopologyData< shards::Hexahedron<>    >();
+    // NOTE(review): for any other spaceDim, cellTopo keeps its default-constructed value.
+    auto lineCubature = Intrepid2::DefaultCubatureFactory::create<DeviceType>(lineTopo,polyOrder*2);
+    int numPoints_1D = lineCubature->getNumPoints();
+    ScalarView<PointScalar,DeviceType> lineCubaturePoints("line cubature points",numPoints_1D,1);
+    ScalarView<double,DeviceType> lineCubatureWeights("line cubature weights", numPoints_1D);
+    
+    lineCubature->getCubature(lineCubaturePoints, lineCubatureWeights);
+    
+    // Allocate some intermediate containers
+    ScalarView<Scalar,DeviceType> lineBasisValues    ("line basis values",      numFields_1D, numPoints_1D   );
+    ScalarView<Scalar,DeviceType> lineBasisGradValues("line basis grad values", numFields_1D, numPoints_1D, 1);
+    
+    // for now, we use 1D values to build up the 2D or 3D gradients
+    // eventually, TensorBasis should offer a getValues() variant that returns tensor basis data
+    lineBasis->getValues(lineBasisValues,     lineCubaturePoints, Intrepid2::OPERATOR_VALUE );
+    lineBasis->getValues(lineBasisGradValues, lineCubaturePoints, Intrepid2::OPERATOR_GRAD  );
+    
+    // drop the trivial space dimension in line gradient values:
+    Kokkos::resize(lineBasisGradValues, numFields_1D, numPoints_1D);
+      
+    Kokkos::Array<TensorData<Scalar,DeviceType>, spaceDim> vectorComponents;
+    // component d of the gradient: 1D gradient values in tensor slot d, 1D basis values in every other slot
+    for (int d=0; d<spaceDim; d++)
+    {
+      Kokkos::Array<Data<Scalar,DeviceType>, spaceDim> gradComponent_d;
+      for (int d2=0; d2<spaceDim; d2++)
+      {
+        if (d2 == d) gradComponent_d[d2] = Data<Scalar,DeviceType>(lineBasisGradValues);
+        else         gradComponent_d[d2] = Data<Scalar,DeviceType>(lineBasisValues);
+      }
+      vectorComponents[d] = TensorData<Scalar,DeviceType>(gradComponent_d);
+    }
+    VectorData<Scalar,DeviceType> gradientVectorData(vectorComponents, false); // false: not axis-aligned
+    BasisValues<Scalar, DeviceType> gradientValues(gradientVectorData);
+    
+    CellGeometry<PointScalar,spaceDim,DeviceType> cellNodes = uniformCartesianMesh<PointScalar,spaceDim,DeviceType>(1.0, meshWidth);
+    
+    // goal here is to do a vector-weighted Poisson; i.e. (f a_u \cdot grad u, a_v \cdot grad v) on each cell
+    
+    int pointsPerCell = 1;
+    for (int d=0; d<spaceDim; d++)
+    {
+      pointsPerCell *= numPoints_1D;
+    }
+    
+    auto jacobian = cellNodes.allocateJacobianData(pointsPerCell);
+    auto jacobianDet = CellTools<DeviceType>::allocateJacobianDet(jacobian); // NOTE(review): jacobianDet is set below but not read again in this function
+    auto jacobianInv = CellTools<DeviceType>::allocateJacobianInv(jacobian);
+    cellNodes.setJacobian(                   jacobian, pointsPerCell);
+    CellTools<DeviceType>::setJacobianDet(jacobianDet, jacobian);
+    CellTools<DeviceType>::setJacobianInv(jacobianInv, jacobian);
+    // a_u = (1, 1/2, 1/4, ...): written through the auViewHost mirror, then deep-copied into auView
+    auto auView = getView<Scalar,DeviceType>("a_u", spaceDim);
+    auto auViewHost = Kokkos::create_mirror(auView);
+    double weight = 1.0;
+    for (int d=0; d<spaceDim; d++)
+    {
+      auViewHost(d) = weight;
+      weight /= 2.0;
+    }
+    Kokkos::deep_copy(auView, auViewHost);
+    // a_v = (1/2, 1, 2, ...): same mirror-then-copy pattern, doubling per dimension
+    auto avView = getView<Scalar,DeviceType>("a_v", spaceDim);
+    auto avViewHost = Kokkos::create_mirror(avView);
+    weight = 0.5;
+    for (int d=0; d<spaceDim; d++)
+    {
+      avViewHost(d) = weight;
+      weight *= 2.0;
+    }
+    Kokkos::deep_copy(avView, avViewHost);
+    // extents are (numCells, pointsPerCell, spaceDim); variation types are CONSTANT in the first two dimensions, GENERAL in the last
+    Data<Scalar,DeviceType> au_data(auView, Kokkos::Array<int,3>{numCells,pointsPerCell,spaceDim}, Kokkos::Array<DataVariationType,3>{CONSTANT,CONSTANT,GENERAL});
+    Data<Scalar,DeviceType> av_data(avView, Kokkos::Array<int,3>{numCells,pointsPerCell,spaceDim}, Kokkos::Array<DataVariationType,3>{CONSTANT,CONSTANT,GENERAL});
+    
+    auto uTransform = Data<Scalar,DeviceType>::allocateMatVecResult(jacobianInv, au_data, true);
+    auto vTransform = Data<Scalar,DeviceType>::allocateMatVecResult(jacobianInv, av_data, true);
+    
+    uTransform.storeMatVec(jacobianInv, au_data, true); // true: transpose jacobianInv when multiplying
+    vTransform.storeMatVec(jacobianInv, av_data, true); // true: transpose jacobianInv when multiplying
+    
+    Intrepid2::TransformedBasisValues<double, DeviceType> utransformedBasisGradients(uTransform, gradientValues);
+    Intrepid2::TransformedBasisValues<double, DeviceType> vtransformedBasisGradients(vTransform, gradientValues);
+    // numPoints is computed exactly like pointsPerCell above (numPoints_1D^spaceDim)
+    int numPoints = 1;
+    for (int d=0; d<spaceDim; d++)
+    {
+      numPoints *= numPoints_1D;
+    }
+    
+    // now, compute transformed values in the classic, expanded way
+    ScalarView<Scalar,DeviceType> expanded_uTransformedGradValues("transformed a_u dot grad values", numCells, numFields, numPoints);
+    ScalarView<Scalar,DeviceType> expanded_vTransformedGradValues("transformed a_v dot grad values", numCells, numFields, numPoints);
+    
+    auto basis = Intrepid2::getBasis< Intrepid2::NodalBasisFamily<DeviceType> >(cellTopo, fs, polyOrder);
+    
+    // Allocate some intermediate containers
+    ScalarView<Scalar,DeviceType> basisValues    ("basis values", numFields, numPoints ); // NOTE(review): filled below but not read again in this function
+    ScalarView<Scalar,DeviceType> basisGradValues("basis grad values", numFields, numPoints, spaceDim);
+
+    ScalarView<Scalar,DeviceType> transformedGradValues("transformed grad values", numCells, numFields, numPoints, spaceDim);
+    ScalarView<Scalar,DeviceType> transformedWeightedGradValues("transformed weighted grad values", numCells, numFields, numPoints, spaceDim); // NOTE(review): not referenced again in this function
+    
+    auto cubature = Intrepid2::DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+    TEST_EQUALITY( numPoints, cubature->getNumPoints()); // cell cubature must have numPoints_1D^spaceDim points
+    ScalarView<PointScalar,DeviceType> cubaturePoints("cubature points",numPoints,spaceDim);
+    ScalarView<double,DeviceType> cubatureWeights("cubature weights", numPoints);
+    
+    cubature->getCubature(cubaturePoints, cubatureWeights);
+    
+    basis->getValues(basisValues,     cubaturePoints, Intrepid2::OPERATOR_VALUE );
+    basis->getValues(basisGradValues, cubaturePoints, Intrepid2::OPERATOR_GRAD  );
+    
+    const int numNodesPerCell = cellNodes.numNodesPerCell();
+    ScalarView<PointScalar,DeviceType> expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim);
+    // copy node coordinates out of cellNodes into a plain (cell, node, dimension) view
+    using ExecutionSpace = typename DeviceType::execution_space;
+    auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{numCells,numNodesPerCell});
+    Kokkos::parallel_for("fill expanded cell nodes", policy,
+    KOKKOS_LAMBDA (const int &cellOrdinal, const int &nodeOrdinal)
+    {
+      for (int d=0; d<spaceDim; d++)
+      {
+        expandedCellNodes(cellOrdinal,nodeOrdinal,d) = cellNodes(cellOrdinal,nodeOrdinal,d);
+      }
+    });
+    
+    ScalarView<Scalar,DeviceType> expandedJacobian("jacobian", numCells, numPoints, spaceDim, spaceDim);
+    ScalarView<Scalar,DeviceType> expandedJacobianInverse("jacobian inverse", numCells, numPoints, spaceDim, spaceDim);
+    
+    using CellTools = Intrepid2::CellTools<DeviceType>;
+    using ExecutionSpace = typename DeviceType::execution_space; // NOTE(review): repeats the identical alias declared earlier in this function
+    using FunctionSpaceTools = Intrepid2::FunctionSpaceTools<DeviceType>;
+    
+    CellTools::setJacobian(expandedJacobian, cubaturePoints, expandedCellNodes, cellTopo);
+    CellTools::setJacobianInv(expandedJacobianInverse, expandedJacobian);
+    
+    FunctionSpaceTools::HGRADtransformGRAD(transformedGradValues, expandedJacobianInverse, basisGradValues);
+    // contract each transformed gradient with a_u and with a_v over the spatial dimension
+    auto policy3 = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{numCells,numFields,numPoints});
+    Kokkos::parallel_for("compute expanded_{u,v}TransformedGradValues", policy3,
+    KOKKOS_LAMBDA (const int &cellOrdinal, const int &fieldOrdinal, const int &pointOrdinal)
+    {
+      Scalar u_result = 0;
+      Scalar v_result = 0;
+      for (int d=0; d<spaceDim; d++)
+      {
+        u_result += auView(d) * transformedGradValues(cellOrdinal,fieldOrdinal,pointOrdinal,d);
+        v_result += avView(d) * transformedGradValues(cellOrdinal,fieldOrdinal,pointOrdinal,d);
+      }
+      expanded_uTransformedGradValues(cellOrdinal,fieldOrdinal,pointOrdinal) = u_result;
+      expanded_vTransformedGradValues(cellOrdinal,fieldOrdinal,pointOrdinal) = v_result;
+    });
+    // compare the expanded reference values with the TransformedBasisValues objects (testFloatingEquality3 is defined elsewhere)
+    testFloatingEquality3(expanded_uTransformedGradValues, utransformedBasisGradients, relTol, absTol, out, success);
+    testFloatingEquality3(expanded_vTransformedGradValues, vtransformedBasisGradients, relTol, absTol, out, success);
+  }
+
   TEUCHOS_UNIT_TEST( TransformedBasisValues, MultiplyByPointWeights_Vector_Identity_Matrix )
   {
     const bool vectorValues      = true;
@@ -436,4 +625,36 @@ namespace
    const int meshWidth = 3;
    testVectorTransformation<spaceDim>(polyOrder, meshWidth, out, success);
  }
+
+  TEUCHOS_UNIT_TEST( TransformedBasisValues, TransformedWeightedVector_1D_p1 )
+  {
+    // 1D, polynomial order 1, mesh width 10
+    constexpr int dim = 1;
+    constexpr int order = 1, width = 10;
+    testWeightedVectorTransformation<dim>(order, width, out, success);
+  }
+   
+  TEUCHOS_UNIT_TEST( TransformedBasisValues, TransformedWeightedVector_1D_p2 )
+  {
+    // 1D, polynomial order 2, mesh width 10
+    constexpr int dim = 1;
+    constexpr int order = 2, width = 10;
+    testWeightedVectorTransformation<dim>(order, width, out, success);
+  }
+
+  TEUCHOS_UNIT_TEST( TransformedBasisValues, TransformedWeightedVector_2D_p1 )
+  {
+    // 2D, polynomial order 1, mesh width 3
+    constexpr int dim = 2;
+    constexpr int order = 1, width = 3;
+    testWeightedVectorTransformation<dim>(order, width, out, success);
+  }
+
+  TEUCHOS_UNIT_TEST( TransformedBasisValues, TransformedWeightedVector_2D_p2 )
+  {
+    // 2D, polynomial order 2, mesh width 3
+    constexpr int dim = 2;
+    constexpr int order = 2, width = 3;
+    testWeightedVectorTransformation<dim>(order, width, out, success);
+  }
 } // anonymous namespace
diff --git a/packages/intrepid2/unit-test/performance/StructuredIntegration/StructuredIntegrationPerformance.cpp b/packages/intrepid2/unit-test/performance/StructuredIntegration/StructuredIntegrationPerformance.cpp
index ab83eda5d694..1d708f698aca 100644
--- a/packages/intrepid2/unit-test/performance/StructuredIntegration/StructuredIntegrationPerformance.cpp
+++ b/packages/intrepid2/unit-test/performance/StructuredIntegration/StructuredIntegrationPerformance.cpp
@@ -36,6 +36,8 @@
 #include "HCURLStructuredAssembly.hpp"
 #include "HVOLStandardAssembly.hpp"
 #include "HVOLStructuredAssembly.hpp"
+#include "VectorWeightedGRADGRADStandardAssembly.hpp"
+#include "VectorWeightedGRADGRADStructuredAssembly.hpp"
 
 enum FormulationChoice
 {
@@ -44,6 +46,7 @@ enum FormulationChoice
   Hdiv,    // (div, div)   + (value, value)
   Hcurl,   // (curl, curl) + (value, value)
   L2,      // (value, value)
+  VectorWeightedPoisson,
   UnknownFormulation
 };
 
@@ -81,11 +84,12 @@ std::string to_string(AlgorithmChoice choice)
 std::string to_string(FormulationChoice choice)
 {
   switch (choice) {
-    case Poisson: return "Poisson";
-    case Hgrad:   return "Hgrad";
-    case Hdiv:    return "Hdiv";
-    case Hcurl:   return "Hcurl";
-    case L2:      return "L2";
+    case Poisson:               return "Poisson";
+    case Hgrad:                 return "Hgrad";
+    case Hdiv:                  return "Hdiv";
+    case Hcurl:                 return "Hcurl";
+    case L2:                    return "L2";
+    case VectorWeightedPoisson: return "VectorWeightedPoisson";
     
     default:      return "Unknown FormulationChoice";
   }
@@ -230,10 +234,12 @@ getMeshWidths(int basisCardinality, int maxStiffnessEntryCount, int maxElements)
   return meshWidths;
 }
 
-template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType, unsigned long spaceDim2=spaceDim>
 Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadrature(FormulationChoice formulation,
-                                        Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
-                                        double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+                                                                   Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                                                   double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount,
+                                                                   Teuchos::RCP<Kokkos::Array<Scalar,spaceDim2>> vectorWeight1 = Teuchos::null,
+                                                                   Teuchos::RCP<Kokkos::Array<Scalar,spaceDim2>> vectorWeight2 = Teuchos::null)
 {
   switch (formulation)
   {
@@ -247,15 +253,19 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadrature(FormulationCh
       return performStandardQuadratureHCURL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
     case L2:
       return performStandardQuadratureHVOL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    case VectorWeightedPoisson:
+      return performStandardQuadratureVectorWeightedGRADGRAD<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, vectorWeight1, vectorWeight2, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
     default:
       return Intrepid2::ScalarView<Scalar,DeviceType>();
   }
 }
 
-template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType, unsigned long spaceDim2=spaceDim>
 Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadrature(FormulationChoice formulation,
-                                          Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
-                                          double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+                                                                     Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                                                     double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount,
+                                                                     Teuchos::RCP<Kokkos::Array<Scalar,spaceDim2>> vectorWeight1 = Teuchos::null,
+                                                                     Teuchos::RCP<Kokkos::Array<Scalar,spaceDim2>> vectorWeight2 = Teuchos::null)
 {
   switch (formulation)
   {
@@ -269,6 +279,8 @@ Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadrature(Formulation
       return performStructuredQuadratureHCURL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
     case L2:
       return performStructuredQuadratureHVOL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    case VectorWeightedPoisson:
+      return performStructuredQuadratureVectorWeightedGRADGRAD<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, vectorWeight1, vectorWeight2, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
     default:
       return Intrepid2::ScalarView<Scalar,DeviceType>();
   }
@@ -280,12 +292,13 @@ typename BasisFamily::BasisPtr getBasisForFormulation(FormulationChoice formulat
   Intrepid2::EFunctionSpace fs;
   switch (formulation)
   {
-    case Poisson: fs = FUNCTION_SPACE_HGRAD; break;
-    case Hgrad:   fs = FUNCTION_SPACE_HGRAD; break;
-    case Hdiv:    fs = FUNCTION_SPACE_HDIV;  break;
-    case Hcurl:   fs = FUNCTION_SPACE_HCURL; break;
-    case L2:      fs = FUNCTION_SPACE_HVOL;  break;
-    case UnknownFormulation: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unknown formulation");
+    case Poisson:               fs = FUNCTION_SPACE_HGRAD; break;
+    case Hgrad:                 fs = FUNCTION_SPACE_HGRAD; break;
+    case Hdiv:                  fs = FUNCTION_SPACE_HDIV;  break;
+    case Hcurl:                 fs = FUNCTION_SPACE_HCURL; break;
+    case L2:                    fs = FUNCTION_SPACE_HVOL;  break;
+    case VectorWeightedPoisson: fs = FUNCTION_SPACE_HGRAD; break;
+    case UnknownFormulation:    INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unknown formulation");
   }
   
   auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
@@ -350,7 +363,7 @@ map<tuple<Mode,FormulationChoice,AlgorithmChoice>,map<int,int> > getWorksetSizeM
   map<tuple<Mode,FormulationChoice,AlgorithmChoice>,map<int,int> > worksetSizeMap; // keys are maps p -> worksetSize
   
   vector<AlgorithmChoice> allAlgorithmChoices {Standard, NonAffineTensor, AffineTensor, Uniform};
-  vector<FormulationChoice> allFormulationChoices {Poisson, Hgrad, Hdiv, Hcurl, L2};
+  vector<FormulationChoice> allFormulationChoices {Poisson, Hgrad, Hdiv, Hcurl, L2, VectorWeightedPoisson};
   
   // skip calibration case; want that to span workset sizes in a particular way…
   vector<Mode> allModes {Test,BestSerial,BestOpenMP_16,BestCuda,Precalibrated};
@@ -590,6 +603,48 @@ map<tuple<Mode,FormulationChoice,AlgorithmChoice>,map<int,int> > getWorksetSizeM
           worksetSizeMap[affineTensorKey][7] =     1;
           worksetSizeMap[affineTensorKey][8] =     1;
         }
+        {
+          // VectorWeightedPoisson
+          // These calibrations were run 5-25-24 on an M2 Ultra, on a fork expected to be merged into Trilinos develop soon.
+          FormulationChoice formulation = VectorWeightedPoisson;
+          tuple<Mode,FormulationChoice,AlgorithmChoice> standardKey {mode,formulation,Standard};
+          tuple<Mode,FormulationChoice,AlgorithmChoice> nonAffineTensorKey {mode,formulation,NonAffineTensor};
+          tuple<Mode,FormulationChoice,AlgorithmChoice> affineTensorKey {mode,formulation,AffineTensor};
+          
+          // best for VectorWeightedPoisson - these are for meshes that range from 32,768 for p=1 to 128 for p=10
+          worksetSizeMap[standardKey][1]  = 4096;
+          worksetSizeMap[standardKey][2]  = 1024;
+          worksetSizeMap[standardKey][3]  =   32;
+          worksetSizeMap[standardKey][4]  =    4;
+          worksetSizeMap[standardKey][5]  =    1;
+          worksetSizeMap[standardKey][6]  =    1;
+          worksetSizeMap[standardKey][7]  =    1;
+          worksetSizeMap[standardKey][8]  =    1;
+          worksetSizeMap[standardKey][9]  =    1;
+          worksetSizeMap[standardKey][10] =    1;
+          
+          worksetSizeMap[nonAffineTensorKey][1]  = 2048;
+          worksetSizeMap[nonAffineTensorKey][2]  = 2048;
+          worksetSizeMap[nonAffineTensorKey][3]  = 128;
+          worksetSizeMap[nonAffineTensorKey][4]  = 16;
+          worksetSizeMap[nonAffineTensorKey][5]  = 2;
+          worksetSizeMap[nonAffineTensorKey][6]  = 1;
+          worksetSizeMap[nonAffineTensorKey][7]  = 1;
+          worksetSizeMap[nonAffineTensorKey][8]  = 1;
+          worksetSizeMap[nonAffineTensorKey][9]  = 1;
+          worksetSizeMap[nonAffineTensorKey][10] = 1;
+           
+          worksetSizeMap[affineTensorKey][1]  = 32768;
+          worksetSizeMap[affineTensorKey][2]  =  8192;
+          worksetSizeMap[affineTensorKey][3]  =   128;
+          worksetSizeMap[affineTensorKey][4]  =     8;
+          worksetSizeMap[affineTensorKey][5]  =     2;
+          worksetSizeMap[affineTensorKey][6]  =     1;
+          worksetSizeMap[affineTensorKey][7]  =     1;
+          worksetSizeMap[affineTensorKey][8]  =     1;
+          worksetSizeMap[affineTensorKey][9]  =     1;
+          worksetSizeMap[affineTensorKey][10] =     1;
+        }
       } // BestSerial case
         break;
       case BestOpenMP_16:
@@ -774,6 +829,48 @@ map<tuple<Mode,FormulationChoice,AlgorithmChoice>,map<int,int> > getWorksetSizeM
           worksetSizeMap[affineTensorKey][7] =    16;
           worksetSizeMap[affineTensorKey][8] =    16;
         }
+        {
+          // VectorWeightedPoisson
+          // These calibrations were run 5-25-24 on an M2 Ultra, on a fork expected to be merged into Trilinos develop soon.
+          FormulationChoice formulation = VectorWeightedPoisson;
+          tuple<Mode,FormulationChoice,AlgorithmChoice> standardKey {mode,formulation,Standard};
+          tuple<Mode,FormulationChoice,AlgorithmChoice> nonAffineTensorKey {mode,formulation,NonAffineTensor};
+          tuple<Mode,FormulationChoice,AlgorithmChoice> affineTensorKey {mode,formulation,AffineTensor};
+          
+          // best for VectorWeightedPoisson - these are for meshes that range from 32,768 for p=1 to 128 for p=10
+          worksetSizeMap[standardKey][1]  = 16384;
+          worksetSizeMap[standardKey][2]  = 16384;
+          worksetSizeMap[standardKey][3]  =  8192;
+          worksetSizeMap[standardKey][4]  =  1024;
+          worksetSizeMap[standardKey][5]  =  1024;
+          worksetSizeMap[standardKey][6]  =  1024;
+          worksetSizeMap[standardKey][7]  =   512;
+          worksetSizeMap[standardKey][8]  =   256;
+          worksetSizeMap[standardKey][9]  =   128;
+          worksetSizeMap[standardKey][10] =    32;
+          
+          worksetSizeMap[nonAffineTensorKey][1]  = 32768;
+          worksetSizeMap[nonAffineTensorKey][2]  =  8192;
+          worksetSizeMap[nonAffineTensorKey][3]  =  8192;
+          worksetSizeMap[nonAffineTensorKey][4]  =  4096;
+          worksetSizeMap[nonAffineTensorKey][5]  =  4096;
+          worksetSizeMap[nonAffineTensorKey][6]  =    64;
+          worksetSizeMap[nonAffineTensorKey][7]  =    32;
+          worksetSizeMap[nonAffineTensorKey][8]  =    32;
+          worksetSizeMap[nonAffineTensorKey][9]  =    16;
+          worksetSizeMap[nonAffineTensorKey][10] =    16;
+           
+          worksetSizeMap[affineTensorKey][1]  = 32768;
+          worksetSizeMap[affineTensorKey][2]  = 16384;
+          worksetSizeMap[affineTensorKey][3]  =  8192;
+          worksetSizeMap[affineTensorKey][4]  =  4096;
+          worksetSizeMap[affineTensorKey][5]  =  4096;
+          worksetSizeMap[affineTensorKey][6]  =  2048;
+          worksetSizeMap[affineTensorKey][7]  =    32;
+          worksetSizeMap[affineTensorKey][8]  =    16;
+          worksetSizeMap[affineTensorKey][9]  =    16;
+          worksetSizeMap[affineTensorKey][10] =    16;
+        }
       } // BestOpenMP_16 case
         break;
       case BestCuda:
@@ -953,6 +1050,23 @@ map<tuple<Mode,FormulationChoice,AlgorithmChoice>,map<int,int> > getWorksetSizeM
           worksetSizeMap[affineTensorKey][7] =  256;
           worksetSizeMap[affineTensorKey][8] =  128;
         } // L^2 formulation
+        {
+          // VectorWeightedPoisson
+          // TODO: set this with some actual calibration result values.  For now, we just borrow from Poisson
+          
+          FormulationChoice formulation = VectorWeightedPoisson;
+          tuple<Mode,FormulationChoice,AlgorithmChoice> standardKey {mode,formulation,Standard};
+          tuple<Mode,FormulationChoice,AlgorithmChoice> nonAffineTensorKey {mode,formulation,NonAffineTensor};
+          tuple<Mode,FormulationChoice,AlgorithmChoice> affineTensorKey {mode,formulation,AffineTensor};
+          
+          tuple<Mode,FormulationChoice,AlgorithmChoice> standardKey_Poisson {mode,Poisson,Standard};
+          tuple<Mode,FormulationChoice,AlgorithmChoice> nonAffineTensorKey_Poisson {mode,Poisson,NonAffineTensor};
+          tuple<Mode,FormulationChoice,AlgorithmChoice> affineTensorKey_Poisson {mode,Poisson,AffineTensor};
+          
+          worksetSizeMap[standardKey]        = worksetSizeMap[standardKey_Poisson];
+          worksetSizeMap[nonAffineTensorKey] = worksetSizeMap[nonAffineTensorKey_Poisson];
+          worksetSizeMap[affineTensorKey]    = worksetSizeMap[affineTensorKey_Poisson];
+        }
     } // BestCuda case
         break;
       case Precalibrated:
@@ -1128,6 +1242,7 @@ int main( int argc, char* argv[] )
       return -1;
     }
     
+    Teuchos::RCP<Kokkos::Array<double,spaceDim>> vectorWeight1, vectorWeight2; // used for VectorWeightedPoisson
     vector<FormulationChoice> formulationChoices;
     if (formulationChoiceString == "All")
     {
@@ -1153,6 +1268,17 @@ int main( int argc, char* argv[] )
     {
       formulationChoices = vector<FormulationChoice>{L2};
     }
+    else if (formulationChoiceString == "VectorWeightedPoisson")
+    {
+      formulationChoices = vector<FormulationChoice>{VectorWeightedPoisson};
+      vectorWeight1 = Teuchos::rcp( new Kokkos::Array<double, spaceDim>() );
+      vectorWeight2 = Teuchos::rcp( new Kokkos::Array<double, spaceDim>() );
+      for (int d=0; d<spaceDim; d++)
+      {
+        (*vectorWeight1)[d] = 1.0;
+        (*vectorWeight2)[d] = 1.0;
+      }
+    }
     else
     {
       cout << "Unrecognized formulation choice: " << formulationChoiceString << endl;
@@ -1387,7 +1513,9 @@ int main( int argc, char* argv[] )
           std::map<AlgorithmChoice, Intrepid2::ScalarView<Scalar,DeviceType> > assembledMatrices;
           for (auto algorithmChoice : algorithmChoices)
           {
-            int worksetSize = worksetSizeMap[algorithmChoice];
+            int worksetSize = 1;
+            if (worksetSizeMap.find(algorithmChoice) != worksetSizeMap.end())
+              worksetSize = worksetSizeMap[algorithmChoice];
             if (mode == Calibration)
             {
               // if this workset size is bigger than the optimal for p-1, skip it -- it's highly
@@ -1428,13 +1556,13 @@ int main( int argc, char* argv[] )
                 case Nodal:
                 {
                   using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
-                  assembledMatrix = performStandardQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                  assembledMatrix = performStandardQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount, vectorWeight1, vectorWeight2);
                 }
                   break;
                 case Hierarchical:
                 {
                   using BasisFamily = HierarchicalBasisFamily<DeviceType>;
-                  assembledMatrix = performStandardQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                  assembledMatrix = performStandardQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount, vectorWeight1, vectorWeight2);
                 }
                   break;
                 case Serendipity:
@@ -1456,13 +1584,13 @@ int main( int argc, char* argv[] )
                 case Nodal:
                 {
                   using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
-                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount, vectorWeight1, vectorWeight2);
                 }
                   break;
                 case Hierarchical:
                 {
                   using BasisFamily = HierarchicalBasisFamily<DeviceType>;
-                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount, vectorWeight1, vectorWeight2);
                 }
                   break;
                 case Serendipity:
@@ -1485,13 +1613,13 @@ int main( int argc, char* argv[] )
                 case Nodal:
                 {
                   using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
-                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount, vectorWeight1, vectorWeight2);
                 }
                   break;
                 case Hierarchical:
                 {
                   using BasisFamily = HierarchicalBasisFamily<DeviceType>;
-                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount, vectorWeight1, vectorWeight2);
                 }
                   break;
                 case Serendipity:
@@ -1520,13 +1648,13 @@ int main( int argc, char* argv[] )
                 case Nodal:
                 {
                   using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
-                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, numCells, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, numCells, transformIntegrateFlopCount, jacobianCellMeasureFlopCount, vectorWeight1, vectorWeight2);
                 }
                   break;
                 case Hierarchical:
                 {
                   using BasisFamily = HierarchicalBasisFamily<DeviceType>;
-                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, numCells, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, numCells, transformIntegrateFlopCount, jacobianCellMeasureFlopCount, vectorWeight1, vectorWeight2);
                 }
                   break;
                 case Serendipity:
diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_SimpleUtils.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_SimpleUtils.hpp
index 0ae29a2f50e0..51ff697bde1e 100644
--- a/packages/kokkos-kernels/common/src/KokkosKernels_SimpleUtils.hpp
+++ b/packages/kokkos-kernels/common/src/KokkosKernels_SimpleUtils.hpp
@@ -358,13 +358,19 @@ struct ReduceMaxFunctor {
 };
 
 template <typename view_type, typename MyExecSpace>
-void kk_view_reduce_max(size_t num_elements, view_type view_to_reduce,
+void kk_view_reduce_max(const MyExecSpace &exec, size_t num_elements, view_type view_to_reduce,
                         typename view_type::non_const_value_type &max_reduction) {
-  typedef Kokkos::RangePolicy<MyExecSpace> my_exec_space;
-  Kokkos::parallel_reduce("KokkosKernels::Common::ReduceMax", my_exec_space(0, num_elements),
+  typedef Kokkos::RangePolicy<MyExecSpace> policy_t;
+  Kokkos::parallel_reduce("KokkosKernels::Common::ReduceMax", policy_t(exec, 0, num_elements),
                           ReduceMaxFunctor<view_type>(view_to_reduce), max_reduction);
 }
 
+template <typename view_type, typename MyExecSpace>
+void kk_view_reduce_max(size_t num_elements, view_type view_to_reduce,
+                        typename view_type::non_const_value_type &max_reduction) {
+  kk_view_reduce_max(MyExecSpace(), num_elements, view_to_reduce, max_reduction);
+}
+
 // xorshift hash/pseudorandom function (supported for 32- and 64-bit integer
 // types only)
 template <typename Value>
@@ -429,10 +435,14 @@ struct SequentialFillFunctor {
   val_type start;
 };
 
+template <typename ExecSpace, typename V>
+void sequential_fill(const ExecSpace &exec, const V &v, typename V::non_const_value_type start = 0) {
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(exec, 0, v.extent(0)), SequentialFillFunctor<V>(v, start));
+}
+
 template <typename V>
 void sequential_fill(const V &v, typename V::non_const_value_type start = 0) {
-  Kokkos::parallel_for(Kokkos::RangePolicy<typename V::execution_space>(0, v.extent(0)),
-                       SequentialFillFunctor<V>(v, start));
+  sequential_fill(typename V::execution_space(), v, start);
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_Utils.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_Utils.hpp
index a087002d3142..f0add80c50ed 100644
--- a/packages/kokkos-kernels/common/src/KokkosKernels_Utils.hpp
+++ b/packages/kokkos-kernels/common/src/KokkosKernels_Utils.hpp
@@ -1076,6 +1076,12 @@ void view_reduce_max(size_t num_elements, view_type view_to_reduce,
   kk_view_reduce_max<view_type, MyExecSpace>(num_elements, view_to_reduce, max_reduction);
 }
 
+template <typename view_type, typename MyExecSpace>
+void view_reduce_max(const MyExecSpace &exec, size_t num_elements, view_type view_to_reduce,
+                     typename view_type::non_const_value_type &max_reduction) {
+  kk_view_reduce_max<view_type, MyExecSpace>(exec, num_elements, view_to_reduce, max_reduction);
+}
+
 template <typename size_type>
 struct ReduceRowSizeFunctor {
   const size_type *rowmap_view_begins;
diff --git a/packages/kokkos-kernels/perf_test/sparse/CMakeLists.txt b/packages/kokkos-kernels/perf_test/sparse/CMakeLists.txt
index ef0bf7d99530..514ef0ed8253 100644
--- a/packages/kokkos-kernels/perf_test/sparse/CMakeLists.txt
+++ b/packages/kokkos-kernels/perf_test/sparse/CMakeLists.txt
@@ -116,6 +116,15 @@ KOKKOSKERNELS_ADD_EXECUTABLE(
         SOURCES KokkosSparse_mdf.cpp
 )
 
+# For the sake of build times, don't build this CRS sorting perf test by default.
+# It can be enabled if needed by setting -DKokkosKernels_ENABLE_SORT_CRS_PERFTEST=ON.
+if (KokkosKernels_ENABLE_SORT_CRS_PERFTEST)
+  KOKKOSKERNELS_ADD_EXECUTABLE(
+    sparse_sort_crs
+    SOURCES KokkosSparse_sort_crs.cpp
+)
+endif ()
+
 if (KokkosKernels_ENABLE_BENCHMARK)
   KOKKOSKERNELS_ADD_BENCHMARK(
     sparse_par_ilut
diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_sort_crs.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_sort_crs.cpp
new file mode 100644
index 000000000000..cd3ed91521d5
--- /dev/null
+++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_sort_crs.cpp
@@ -0,0 +1,103 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <iostream>
+#include <algorithm>
+#include <random>
+#include "KokkosKernels_config.h"
+#include "KokkosSparse_IOUtils.hpp"
+#include "KokkosKernels_perf_test_utilities.hpp"
+#include "KokkosSparse_CrsMatrix.hpp"
+#include "KokkosSparse_SortCrs.hpp"
+
+using perf_test::CommonInputParams;
+
+struct LocalParams {  // command-line options specific to this perf test
+  std::string mtxFile;  // filled from the "--mtx" argument by parse_inputs()
+};
+
+// Prints usage to stderr: a heading, perf_test::list_common_options(), then the flags of this test.
+void print_options() {
+  std::cerr << "Options\n" << std::endl;
+  std::cerr << perf_test::list_common_options();
+
+  std::cerr << "\t[Required] --mtx <path> :: matrix to sort\n"
+            << "\t[Optional] --repeat      :: how many times to repeat sorting\n";
+}
+
+// Accepts only "--mtx <path>"; returns 0 on success, 1 (after printing usage) on any other argument.
+int parse_inputs(LocalParams& params, int argc, char** argv) {
+  for (int argIdx = 1; argIdx < argc; ++argIdx) {
+    if (!perf_test::check_arg_str(argIdx, argc, argv, "--mtx", params.mtxFile)) {
+      std::cerr << "Unrecognized command line argument #" << argIdx << ": " << argv[argIdx] << std::endl;
+      print_options();
+      return 1;
+    }
+    ++argIdx;  // extra step on a match: presumably skips the value check_arg_str consumed
+  }
+  return 0;
+}
+
+// Perf test body: loads the --mtx matrix, scrambles the column order inside every row on the host,
+// and prints the mean time of sort_crs_matrix(exec, A) over common_params.repeat trials.
+template <typename exec_space>
+void run_experiment(int argc, char** argv, const CommonInputParams& common_params) {
+  using namespace KokkosSparse;
+
+  using mem_space = typename exec_space::memory_space;
+  using device_t  = typename Kokkos::Device<exec_space, mem_space>;
+  using size_type = default_size_type;
+  using lno_t     = default_lno_t;
+  using scalar_t  = default_scalar;
+  using crsMat_t  = KokkosSparse::CrsMatrix<scalar_t, lno_t, device_t, void, size_type>;
+
+  LocalParams params;
+  if (parse_inputs(params, argc, argv)) return;
+
+  crsMat_t A = KokkosSparse::Impl::read_kokkos_crst_matrix<crsMat_t>(params.mtxFile.c_str());
+  std::cout << "Loaded matrix: " << A.numRows() << "x" << A.numCols() << " with " << A.nnz() << " entries.\n";
+  // This first sort call serves as a warm-up
+  KokkosSparse::sort_crs_matrix(A);
+  lno_t m          = A.numRows();
+  auto rowmapHost  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map);
+  auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries);
+  typename crsMat_t::index_type shuffledEntries("shuffled entries", A.nnz());
+  // Randomly shuffle the entries within each row (row i spans [rowmapHost(i), rowmapHost(i + 1))),
+  // so that the rows aren't already sorted. Leave the values alone; this changes the matrix
+  // numerically but this doesn't affect sorting. The seed is fixed so every run times the same input.
+  std::mt19937 rng(12345);
+  for (lno_t i = 0; i < m; i++) {
+    std::shuffle(entriesHost.data() + rowmapHost(i), entriesHost.data() + rowmapHost(i + 1), rng);
+  }
+  Kokkos::deep_copy(shuffledEntries, entriesHost);
+  exec_space exec;
+  Kokkos::Timer timer;
+  double totalTime = 0;
+  for (int rep = 0; rep < common_params.repeat; rep++) {
+    Kokkos::deep_copy(exec, A.graph.entries, shuffledEntries);
+    exec.fence();
+    timer.reset();
+    KokkosSparse::sort_crs_matrix(exec, A);
+    exec.fence();
+    totalTime += timer.seconds();
+  }
+  std::cout << "Mean sort_crs_matrix time over " << common_params.repeat << " trials: ";
+  std::cout << totalTime / common_params.repeat << "\n";
+}
+
+#define KOKKOSKERNELS_PERF_TEST_NAME run_experiment
+#include "KokkosKernels_perf_test_instantiation.hpp"
+int main(int argc, char** argv) { return main_instantiation(argc, argv); }  // main
diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sort_crs_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sort_crs_impl.hpp
new file mode 100644
index 000000000000..5e18c3fd5ca2
--- /dev/null
+++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sort_crs_impl.hpp
@@ -0,0 +1,366 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+#ifndef _KOKKOSSPARSE_SORTCRS_IMPL_HPP
+#define _KOKKOSSPARSE_SORTCRS_IMPL_HPP
+
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Sort.hpp"
+#include "KokkosKernels_Sorting.hpp"
+
+// Workaround for issue with Kokkos::Experimental::sort_by_key, with nvcc and OpenMP enabled
+// (Kokkos issue #7036, fixed in 4.4 release)
+// Once support for Kokkos < 4.4 is dropped,
+// all code inside "ifdef KK_DISABLE_BULK_SORT_BY_KEY" can be deleted.
+#if (KOKKOS_VERSION < 40400) && defined(KOKKOS_ENABLE_CUDA)
+#define KK_DISABLE_BULK_SORT_BY_KEY
+#endif
+
+namespace KokkosSparse {
+namespace Impl {
+
+// Per-row functor: calls SerialRadixSort2 on row i's entries (as unsigned keys) and values, using aux scratch.
+template <typename rowmap_t, typename entries_t, typename values_t>
+struct MatrixRadixSortFunctor {
+  using Offset          = typename rowmap_t::non_const_value_type;
+  using Ordinal         = typename entries_t::non_const_value_type;
+  using UnsignedOrdinal = typename std::make_unsigned<Ordinal>::type;
+  using Scalar          = typename values_t::non_const_value_type;
+  // The functor owns memory for the aux buffers, so they can't have MemoryTraits<Unmanaged>
+  using entries_managed_t = Kokkos::View<typename entries_t::data_type, typename entries_t::device_type>;
+  using values_managed_t  = Kokkos::View<typename values_t::data_type, typename values_t::device_type>;
+
+  MatrixRadixSortFunctor(const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_)
+      : rowmap(rowmap_),
+        entries(entries_),
+        entriesAux(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), entries_.extent(0)),
+        values(values_),
+        valuesAux(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values aux"), values_.extent(0)) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(Ordinal i) const {
+    const Offset begin = rowmap(i);
+    const Ordinal len  = rowmap(i + 1) - begin;
+    // Radix sort requires unsigned keys for comparison
+    KokkosKernels::SerialRadixSort2<Ordinal, UnsignedOrdinal, Scalar>(
+        (UnsignedOrdinal*)entries.data() + begin, (UnsignedOrdinal*)entriesAux.data() + begin,
+        values.data() + begin, valuesAux.data() + begin, len);
+  }
+
+  rowmap_t rowmap;
+  entries_t entries;
+  entries_managed_t entriesAux;
+  values_t values;
+  values_managed_t valuesAux;
+};
+
+// Team functor, one row per thread: thread (league_rank * team_size + team_rank) key-sorts its row in place.
+template <typename Policy, typename Ordinal, typename rowmap_t, typename entries_t, typename values_t>
+struct MatrixThreadSortFunctor {
+  using Offset = typename rowmap_t::non_const_value_type;
+
+  MatrixThreadSortFunctor(Ordinal numRows_, const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_)
+      : numRows(numRows_), rowmap(rowmap_), entries(entries_), values(values_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(const typename Policy::member_type& t) const {
+    const Ordinal row = t.league_rank() * t.team_size() + t.team_rank();
+    if (row >= numRows) return;  // threads past the last row have nothing to sort
+    const auto rowRange = Kokkos::make_pair(Offset(rowmap(row)), Offset(rowmap(row + 1)));
+    auto rowEntries     = Kokkos::subview(entries, rowRange);
+    auto rowValues      = Kokkos::subview(values, rowRange);
+    Kokkos::Experimental::sort_by_key_thread(t, rowEntries, rowValues);
+  }
+
+  Ordinal numRows;
+  rowmap_t rowmap;
+  entries_t entries;
+  values_t values;
+};
+
+// Per-row functor: calls SerialRadixSort on row i's entries (viewed as unsigned keys), using aux scratch.
+template <typename rowmap_t, typename entries_t>
+struct GraphRadixSortFunctor {
+  using Offset          = typename rowmap_t::non_const_value_type;
+  using Ordinal         = typename entries_t::non_const_value_type;
+  using UnsignedOrdinal = typename std::make_unsigned<Ordinal>::type;
+  // The functor owns memory for entriesAux, so it can't have MemoryTraits<Unmanaged>
+  using entries_managed_t = Kokkos::View<typename entries_t::data_type, typename entries_t::device_type>;
+
+  GraphRadixSortFunctor(const rowmap_t& rowmap_, const entries_t& entries_)
+      : rowmap(rowmap_),
+        entries(entries_),
+        entriesAux(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), entries_.extent(0)) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(Ordinal i) const {
+    const Offset begin = rowmap(i);
+    const Ordinal len  = rowmap(i + 1) - begin;
+    // Radix sort requires unsigned keys for comparison
+    KokkosKernels::SerialRadixSort<Ordinal, UnsignedOrdinal>((UnsignedOrdinal*)entries.data() + begin,
+                                                             (UnsignedOrdinal*)entriesAux.data() + begin, len);
+  }
+
+  rowmap_t rowmap;
+  entries_t entries;
+  entries_managed_t entriesAux;
+};
+
+// Team functor, one row per thread: thread (league_rank * team_size + team_rank) sorts its row's entries.
+template <typename Policy, typename Ordinal, typename rowmap_t, typename entries_t>
+struct GraphThreadSortFunctor {
+  using Offset = typename rowmap_t::non_const_value_type;
+
+  GraphThreadSortFunctor(Ordinal numRows_, const rowmap_t& rowmap_, const entries_t& entries_)
+      : numRows(numRows_), rowmap(rowmap_), entries(entries_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(const typename Policy::member_type& t) const {
+    const Ordinal row = t.league_rank() * t.team_size() + t.team_rank();
+    if (row >= numRows) return;  // threads past the last row have nothing to sort
+    const auto rowRange = Kokkos::make_pair(Offset(rowmap(row)), Offset(rowmap(row + 1)));
+    auto rowEntries     = Kokkos::subview(entries, rowRange);
+    Kokkos::Experimental::sort_thread(t, rowEntries);
+  }
+
+  Ordinal numRows;
+  rowmap_t rowmap;
+  entries_t entries;
+};
+
+// Reduction functor over rows: writes to mergedCounts(row) how many distinct column indices the
+// (sorted) row holds and adds that count to the reduction total. The invocation for the last row
+// also zeroes the slot one past it.
+template <typename rowmap_t, typename entries_t>
+struct MergedRowmapFunctor {
+  using size_type  = typename rowmap_t::non_const_value_type;
+  using lno_t      = typename entries_t::non_const_value_type;
+  using c_rowmap_t = typename rowmap_t::const_type;
+
+  // Precondition: entries are sorted within each row
+  MergedRowmapFunctor(const rowmap_t& mergedCounts_, const c_rowmap_t& rowmap_, const entries_t& entries_)
+      : mergedCounts(mergedCounts_), rowmap(rowmap_), entries(entries_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(lno_t row, size_type& lnewNNZ) const {
+    // Zero the slot past the last row before looking at the row itself, so the store
+    // happens even when the last row is empty.
+    if (row == lno_t((rowmap.extent(0) - 1) - 1)) mergedCounts(row + 1) = 0;
+    size_type rowBegin = rowmap(row);
+    size_type rowEnd   = rowmap(row + 1);
+    // An empty row has no unique entries; otherwise the first entry always counts.
+    lno_t uniqueEntries = (rowEnd == rowBegin) ? 0 : 1;
+    for (size_type j = rowBegin + 1; j < rowEnd; j++) {
+      if (entries(j - 1) != entries(j)) uniqueEntries++;
+    }
+    mergedCounts(row) = uniqueEntries;
+    lnewNNZ += uniqueEntries;
+  }
+
+  rowmap_t mergedCounts;
+  c_rowmap_t rowmap;
+  entries_t entries;
+};
+
+// Per-row functor: collapses runs of equal column indices, summing their values, into the merged arrays.
+template <typename rowmap_t, typename entries_t, typename values_t>
+struct MatrixMergedEntriesFunctor {
+  using size_type = typename rowmap_t::non_const_value_type;
+  using lno_t     = typename entries_t::non_const_value_type;
+  using scalar_t  = typename values_t::non_const_value_type;
+
+  // Precondition: entries are sorted within each row
+  MatrixMergedEntriesFunctor(const typename rowmap_t::const_type& rowmap_, const entries_t& entries_,
+                             const values_t& values_, const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_,
+                             const values_t& mergedValues_)
+      : rowmap(rowmap_),
+        entries(entries_),
+        values(values_),
+        mergedRowmap(mergedRowmap_),
+        mergedEntries(mergedEntries_),
+        mergedValues(mergedValues_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const {
+    const size_type first = rowmap(row);
+    const size_type last  = rowmap(row + 1);
+    // Row was empty to begin with, nothing to do
+    if (first == last) return;
+    // Running (column, sum) pair for the run of equal columns currently being merged
+    lno_t curCol     = entries(first);
+    scalar_t curSum  = values(first);
+    size_type outPos = mergedRowmap(row);
+    for (size_type k = first + 1; k < last; k++) {
+      const lno_t col = entries(k);
+      if (col == curCol) {
+        // same column again: accumulate
+        curSum += values(k);
+        continue;
+      }
+      // new column: write out the finished run and start the next one
+      mergedValues(outPos)  = curSum;
+      mergedEntries(outPos) = curCol;
+      outPos++;
+      curSum = values(k);
+      curCol = col;
+    }
+    // always left with the last unique entry
+    mergedValues(outPos)  = curSum;
+    mergedEntries(outPos) = curCol;
+  }
+
+  typename rowmap_t::const_type rowmap;
+  entries_t entries;
+  values_t values;
+  rowmap_t mergedRowmap;
+  entries_t mergedEntries;
+  values_t mergedValues;
+};
+
+// Per-row functor: copies each distinct column index of a row once into mergedEntries, starting at
+// mergedRowmap(row). Graph-only counterpart of the matrix version: there are no values to sum.
+template <typename rowmap_t, typename entries_t>
+struct GraphMergedEntriesFunctor {
+  using size_type = typename rowmap_t::non_const_value_type;
+  using lno_t     = typename entries_t::non_const_value_type;
+
+  // Precondition: entries are sorted within each row
+  GraphMergedEntriesFunctor(const typename rowmap_t::const_type& rowmap_, const entries_t& entries_,
+                            const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_)
+      : rowmap(rowmap_), entries(entries_), mergedRowmap(mergedRowmap_), mergedEntries(mergedEntries_) {}
+
+  KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const {
+    const size_type first = rowmap(row);
+    const size_type last  = rowmap(row + 1);
+    // Row was empty to begin with, nothing to do
+    if (first == last) return;
+    // Column index of the run of duplicates currently being collapsed
+    lno_t curCol     = entries(first);
+    size_type outPos = mergedRowmap(row);
+    for (size_type k = first + 1; k < last; k++) {
+      const lno_t col = entries(k);
+      if (col == curCol) continue;
+      // new column: write out the previous one and start the next run
+      mergedEntries(outPos) = curCol;
+      outPos++;
+      curCol = col;
+    }
+    // always left with the last unique entry
+    mergedEntries(outPos) = curCol;
+  }
+
+  typename rowmap_t::const_type rowmap;
+  entries_t entries;
+  rowmap_t mergedRowmap;
+  entries_t mergedEntries;
+};
+
+// Inclusive max-scan over keys; on the final pass each key is overwritten with (scanned max) * ncols + entries(i).
+template <typename Offset, typename Keys, typename Entries>
+struct MaxScanFunctor {
+  using value_type = uint64_t;
+
+  MaxScanFunctor(uint64_t ncols_, const Keys& keys_, const Entries& entries_)
+      : ncols(ncols_), keys(keys_), entries(entries_) {}
+
+  // 0 is the identity of max over unsigned values.
+  KOKKOS_INLINE_FUNCTION void init(value_type& update) const { update = 0; }
+
+  // Combines two partial maxima.
+  KOKKOS_INLINE_FUNCTION void join(value_type& update, const value_type& input) const {
+    update = Kokkos::max(update, input);
+  }
+
+  KOKKOS_INLINE_FUNCTION void operator()(Offset i, value_type& lmax, bool finalPass) const {
+    const value_type runningMax = Kokkos::max(lmax, keys(i));
+    lmax                        = runningMax;
+    // runningMax is the row containing entry i. The key is equivalent to the entry's
+    // linear index if the matrix were dense and row-major.
+    if (finalPass) keys(i) = runningMax * ncols + entries(i);
+  }
+
+  uint64_t ncols;
+  Keys keys;
+  Entries entries;
+};
+
+// Builds one 64-bit sort key per entry, row * ncols + column, so that sorting all keys sorts every row at once.
+template <typename ExecSpace, typename Rowmap, typename Entries>
+Kokkos::View<uint64_t*, ExecSpace> generateBulkCrsKeys(const ExecSpace& exec, const Rowmap& rowmap,
+                                                       const Entries& entries,
+                                                       typename Entries::non_const_value_type ncols) {
+  using Offset    = typename Rowmap::non_const_value_type;
+  using Ordinal   = typename Entries::non_const_value_type;
+  Ordinal numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
+  // Zero-initialized: the max-scan below relies on unmarked entries not exceeding any row index.
+  Kokkos::View<uint64_t*, ExecSpace> keys("keys", entries.extent(0));
+  Kokkos::parallel_for(
+      "CRS bulk sorting: mark row begins", Kokkos::RangePolicy<ExecSpace>(exec, 0, numRows), KOKKOS_LAMBDA(Ordinal i) {
+        Offset rowBegin = rowmap(i);
+        // Only mark the beginnings of non-empty rows; otherwise multiple rows could try to update the same key.
+        if (rowmap(i + 1) != rowBegin) keys(rowBegin) = uint64_t(i);
+      });
+  // Both kernels run on exec, so fence that instance only; a global Kokkos::fence() would also stall other instances.
+  exec.fence();
+  Kokkos::parallel_scan("CRS bulk sorting: compute keys", Kokkos::RangePolicy<ExecSpace>(exec, 0, entries.extent(0)),
+                        MaxScanFunctor<Offset, decltype(keys), Entries>(ncols, keys, entries));
+  exec.fence();
+  return keys;
+}
+
+#ifndef KK_DISABLE_BULK_SORT_BY_KEY
+template <typename ExecSpace, typename Rowmap, typename Entries>
+Kokkos::View<typename Rowmap::non_const_value_type*, ExecSpace> computeEntryPermutation(
+    const ExecSpace& exec, const Rowmap& rowmap, const Entries& entries, typename Entries::non_const_value_type ncols) {
+  // Returns the entry ordering obtained by sorting the bulk keys of all entries in a single sort_by_key call.
+  using PermView = Kokkos::View<typename Rowmap::non_const_value_type*, ExecSpace>;
+  auto sortKeys  = generateBulkCrsKeys(exec, rowmap, entries, ncols);
+  PermView perm(Kokkos::view_alloc(Kokkos::WithoutInitializing, "permutation"), entries.extent(0));
+  // This initializes perm as the identity; sort_by_key then reorders it together with the keys.
+  KokkosKernels::Impl::sequential_fill(exec, perm);
+  Kokkos::Experimental::sort_by_key(exec, sortKeys, perm);
+  return perm;
+}
+
+// Heuristic for choosing bulk sorting algorithm: use bulk sort if the matrix is highly imbalanced
+// (maxDeg / 10 > avgDeg, integer division) OR the longest row has more than 1024 entries.
+template <typename Ordinal>
+bool useBulkSortHeuristic(Ordinal avgDeg, Ordinal maxDeg) {
+  const bool highlyImbalanced = avgDeg < maxDeg / 10;
+  return highlyImbalanced || 1024 < maxDeg;
+}
+#endif
+
+// Gather through a permutation: out(k) = in(permutation(k)) for every k in [0, in.extent(0)).
+template <typename ExecSpace, typename Permutation, typename InView, typename OutView>
+void applyPermutation(const ExecSpace& exec, const Permutation& permutation, const InView& in, const OutView& out) {
+  Kokkos::RangePolicy<ExecSpace> range(exec, 0, in.extent(0));
+  Kokkos::parallel_for("CRS bulk sorting: permute", range, KOKKOS_LAMBDA(size_t k) { out(k) = in(permutation(k)); });
+}
+
+// Permutes whole blocks of values: block b of out is block permutation(b) of in (blockSize^2 scalars per block).
+template <typename ExecSpace, typename Permutation, typename InView, typename OutView, typename Ordinal>
+void applyPermutationBlockValues(const ExecSpace& exec, const Permutation& permutation, const InView& in,
+                                 const OutView& out, Ordinal blockSize) {
+  uint64_t scalarsPerBlock = (uint64_t)blockSize * blockSize;
+  // Exactly one block per entry: guards blockSize == 0 and out-of-range reads of permutation/in.
+  if (in.extent(0) != permutation.extent(0) * scalarsPerBlock)
+    throw std::invalid_argument("sort_bsr_matrix: values extent must equal entries extent * blockSize^2");
+  Kokkos::parallel_for(
+      "BSR bulk sorting: permute", Kokkos::RangePolicy<ExecSpace>(exec, 0, in.extent(0)), KOKKOS_LAMBDA(size_t i) {
+        uint64_t blockIndex    = i / scalarsPerBlock;
+        uint64_t offsetInBlock = i % scalarsPerBlock;
+        out(i)                 = in(permutation(blockIndex) * scalarsPerBlock + offsetInBlock);
+      });
+}
+
+}  // namespace Impl
+}  // namespace KokkosSparse
+
+#endif
diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_SortCrs.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_SortCrs.hpp
index 455068b56f43..1203cd244b5b 100644
--- a/packages/kokkos-kernels/sparse/src/KokkosSparse_SortCrs.hpp
+++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_SortCrs.hpp
@@ -16,38 +16,11 @@
 #ifndef _KOKKOSSPARSE_SORTCRS_HPP
 #define _KOKKOSSPARSE_SORTCRS_HPP
 
-#include "Kokkos_Core.hpp"
-#include "KokkosKernels_Sorting.hpp"
+#include "KokkosSparse_sort_crs_impl.hpp"
+#include "KokkosSparse_Utils.hpp"
 
 namespace KokkosSparse {
 
-// ----------------------------------
-// BSR matrix/graph sorting utilities
-// ----------------------------------
-
-// Sort a BRS matrix: within each row, sort entries ascending by column and
-// permute the values accordingly.
-template <typename execution_space, typename rowmap_t, typename entries_t, typename values_t,
-          typename lno_t = typename entries_t::non_const_value_type>
-void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, const entries_t& entries, const values_t& values);
-
-// Sort a BRS matrix on the given execution space instance: within each row,
-// sort entries ascending by column and permute the values accordingly.
-template <typename execution_space, typename rowmap_t, typename entries_t, typename values_t,
-          typename lno_t = typename entries_t::non_const_value_type>
-void sort_bsr_matrix(const execution_space& exec, const lno_t blockdim, const rowmap_t& rowmap,
-                     const entries_t& entries, const values_t& values);
-
-// Sort a BRS matrix: within each row, sort entries ascending by column and
-// permute the values accordingly.
-template <typename bsrMat_t>
-void sort_bsr_matrix(const bsrMat_t& A);
-
-// Sort a BRS matrix on the given execution space instance: within each row,
-// sort entries ascending by column and permute the values accordingly.
-template <typename bsrMat_t>
-void sort_bsr_matrix(const typename bsrMat_t::execution_space& exec, const bsrMat_t& A);
-
 // ----------------------------------
 // CRS matrix/graph sorting utilities
 // ----------------------------------
@@ -63,269 +36,13 @@ void sort_bsr_matrix(const typename bsrMat_t::execution_space& exec, const bsrMa
 // duplicated entries in A, A is sorted and returned (instead of a newly
 // allocated matrix).
 
-namespace Impl {
-
-template <typename execution_space, typename rowmap_t, typename entries_t, typename values_t>
-struct SortCrsMatrixFunctor {
-  using size_type = typename rowmap_t::non_const_value_type;
-  using lno_t     = typename entries_t::non_const_value_type;
-  using scalar_t  = typename values_t::non_const_value_type;
-  using team_mem  = typename Kokkos::TeamPolicy<execution_space>::member_type;
-  // The functor owns memory for entriesAux, so it can't have
-  // MemoryTraits<Unmanaged>
-  using entries_managed_t = Kokkos::View<typename entries_t::data_type, typename entries_t::device_type>;
-  using values_managed_t  = Kokkos::View<typename values_t::data_type, typename values_t::device_type>;
-
-  SortCrsMatrixFunctor(bool usingRangePol, const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_)
-      : rowmap(rowmap_), entries(entries_), values(values_) {
-    if (usingRangePol) {
-      entriesAux = entries_managed_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), entries.extent(0));
-      valuesAux  = values_managed_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values aux"), values.extent(0));
-    }
-    // otherwise, aux arrays won't be allocated (sorting in place)
-  }
-
-  KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const {
-    size_type rowStart = rowmap(i);
-    size_type rowEnd   = rowmap(i + 1);
-    lno_t rowNum       = rowEnd - rowStart;
-    // Radix sort requires unsigned keys for comparison
-    using unsigned_lno_t = typename std::make_unsigned<lno_t>::type;
-    KokkosKernels::SerialRadixSort2<lno_t, unsigned_lno_t, scalar_t>(
-        (unsigned_lno_t*)entries.data() + rowStart, (unsigned_lno_t*)entriesAux.data() + rowStart,
-        values.data() + rowStart, valuesAux.data() + rowStart, rowNum);
-  }
-
-  KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const {
-    size_type i        = t.league_rank();
-    size_type rowStart = rowmap(i);
-    size_type rowEnd   = rowmap(i + 1);
-    lno_t rowNum       = rowEnd - rowStart;
-    KokkosKernels::TeamBitonicSort2<lno_t, lno_t, scalar_t, team_mem>(entries.data() + rowStart,
-                                                                      values.data() + rowStart, rowNum, t);
-  }
-
-  rowmap_t rowmap;
-  entries_t entries;
-  entries_managed_t entriesAux;
-  values_t values;
-  values_managed_t valuesAux;
-};
-
-template <typename execution_space, typename rowmap_t, typename entries_t>
-struct SortCrsGraphFunctor {
-  using size_type = typename rowmap_t::non_const_value_type;
-  using lno_t     = typename entries_t::non_const_value_type;
-  using team_mem  = typename Kokkos::TeamPolicy<execution_space>::member_type;
-  // The functor owns memory for entriesAux, so it can't have
-  // MemoryTraits<Unmanaged>
-  using entries_managed_t = Kokkos::View<typename entries_t::data_type, typename entries_t::device_type>;
-
-  SortCrsGraphFunctor(bool usingRangePol, const rowmap_t& rowmap_, const entries_t& entries_)
-      : rowmap(rowmap_), entries(entries_) {
-    if (usingRangePol) {
-      entriesAux = entries_managed_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), entries.extent(0));
-    }
-    // otherwise, aux arrays won't be allocated (sorting in place)
-  }
-
-  KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const {
-    size_type rowStart = rowmap(i);
-    size_type rowEnd   = rowmap(i + 1);
-    lno_t rowNum       = rowEnd - rowStart;
-    // Radix sort requires unsigned keys for comparison
-    using unsigned_lno_t = typename std::make_unsigned<lno_t>::type;
-    KokkosKernels::SerialRadixSort<lno_t, unsigned_lno_t>((unsigned_lno_t*)entries.data() + rowStart,
-                                                          (unsigned_lno_t*)entriesAux.data() + rowStart, rowNum);
-  }
-
-  KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const {
-    size_type i        = t.league_rank();
-    size_type rowStart = rowmap(i);
-    size_type rowEnd   = rowmap(i + 1);
-    lno_t rowNum       = rowEnd - rowStart;
-    KokkosKernels::TeamBitonicSort<lno_t, lno_t, team_mem>(entries.data() + rowStart, rowNum, t);
-  }
-
-  rowmap_t rowmap;
-  entries_t entries;
-  entries_managed_t entriesAux;
-};
-
-template <typename rowmap_t, typename entries_t>
-struct MergedRowmapFunctor {
-  using size_type  = typename rowmap_t::non_const_value_type;
-  using lno_t      = typename entries_t::non_const_value_type;
-  using c_rowmap_t = typename rowmap_t::const_type;
-
-  // Precondition: entries are sorted within each row
-  MergedRowmapFunctor(const rowmap_t& mergedCounts_, const c_rowmap_t& rowmap_, const entries_t& entries_)
-      : mergedCounts(mergedCounts_), rowmap(rowmap_), entries(entries_) {}
-
-  KOKKOS_INLINE_FUNCTION void operator()(lno_t row, size_type& lnewNNZ) const {
-    size_type rowBegin = rowmap(row);
-    size_type rowEnd   = rowmap(row + 1);
-    if (rowEnd == rowBegin) {
-      // Row was empty to begin with
-      mergedCounts(row) = 0;
-      return;
-    }
-    // Otherwise, the first entry in the row exists
-    lno_t uniqueEntries = 1;
-    for (size_type j = rowBegin + 1; j < rowEnd; j++) {
-      if (entries(j - 1) != entries(j)) uniqueEntries++;
-    }
-    mergedCounts(row) = uniqueEntries;
-    lnewNNZ += uniqueEntries;
-    if (row == lno_t((rowmap.extent(0) - 1) - 1)) mergedCounts(row + 1) = 0;
-  }
-
-  rowmap_t mergedCounts;
-  c_rowmap_t rowmap;
-  entries_t entries;
-};
-
-template <typename rowmap_t, typename entries_t, typename values_t>
-struct MatrixMergedEntriesFunctor {
-  using size_type = typename rowmap_t::non_const_value_type;
-  using lno_t     = typename entries_t::non_const_value_type;
-  using scalar_t  = typename values_t::non_const_value_type;
-
-  // Precondition: entries are sorted within each row
-  MatrixMergedEntriesFunctor(const typename rowmap_t::const_type& rowmap_, const entries_t& entries_,
-                             const values_t& values_, const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_,
-                             const values_t& mergedValues_)
-      : rowmap(rowmap_),
-        entries(entries_),
-        values(values_),
-        mergedRowmap(mergedRowmap_),
-        mergedEntries(mergedEntries_),
-        mergedValues(mergedValues_) {}
-
-  KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const {
-    size_type rowBegin = rowmap(row);
-    size_type rowEnd   = rowmap(row + 1);
-    if (rowEnd == rowBegin) {
-      // Row was empty to begin with, nothing to do
-      return;
-    }
-    // Otherwise, accumulate the value for each column
-    scalar_t accumVal   = values(rowBegin);
-    lno_t accumCol      = entries(rowBegin);
-    size_type insertPos = mergedRowmap(row);
-    for (size_type j = rowBegin + 1; j < rowEnd; j++) {
-      if (accumCol == entries(j)) {
-        // accumulate
-        accumVal += values(j);
-      } else {
-        // write out and reset
-        mergedValues(insertPos)  = accumVal;
-        mergedEntries(insertPos) = accumCol;
-        insertPos++;
-        accumVal = values(j);
-        accumCol = entries(j);
-      }
-    }
-    // always left with the last unique entry
-    mergedValues(insertPos)  = accumVal;
-    mergedEntries(insertPos) = accumCol;
-  }
-
-  typename rowmap_t::const_type rowmap;
-  entries_t entries;
-  values_t values;
-  rowmap_t mergedRowmap;
-  entries_t mergedEntries;
-  values_t mergedValues;
-};
-
-template <typename rowmap_t, typename entries_t>
-struct GraphMergedEntriesFunctor {
-  using size_type = typename rowmap_t::non_const_value_type;
-  using lno_t     = typename entries_t::non_const_value_type;
-
-  // Precondition: entries are sorted within each row
-  GraphMergedEntriesFunctor(const typename rowmap_t::const_type& rowmap_, const entries_t& entries_,
-                            const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_)
-      : rowmap(rowmap_), entries(entries_), mergedRowmap(mergedRowmap_), mergedEntries(mergedEntries_) {}
-
-  KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const {
-    size_type rowBegin = rowmap(row);
-    size_type rowEnd   = rowmap(row + 1);
-    if (rowEnd == rowBegin) {
-      // Row was empty to begin with, nothing to do
-      return;
-    }
-    // Otherwise, accumulate the value for each column
-    lno_t accumCol      = entries(rowBegin);
-    size_type insertPos = mergedRowmap(row);
-    for (size_type j = rowBegin + 1; j < rowEnd; j++) {
-      if (accumCol != entries(j)) {
-        // write out and reset
-        mergedEntries(insertPos) = accumCol;
-        insertPos++;
-        accumCol = entries(j);
-      }
-    }
-    // always left with the last unique entry
-    mergedEntries(insertPos) = accumCol;
-  }
-
-  typename rowmap_t::const_type rowmap;
-  entries_t entries;
-  rowmap_t mergedRowmap;
-  entries_t mergedEntries;
-};
-
-template <typename T>
-KOKKOS_INLINE_FUNCTION void kk_swap(T& a, T& b) {
-  T t = a;
-  a   = b;
-  b   = t;
-}
-
-template <typename row_map_type, typename entries_type, typename values_type>
-struct sort_bsr_functor {
-  using lno_t = typename entries_type::non_const_value_type;
-
-  row_map_type rowmap;
-  entries_type entries;
-  values_type values;
-  const lno_t blocksize;
-
-  sort_bsr_functor(row_map_type rowmap_, entries_type entries_, values_type values_, const lno_t blocksize_)
-      : rowmap(rowmap_), entries(entries_), values(values_), blocksize(blocksize_) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const lno_t i) const {
-    const lno_t rowStart = rowmap(i);
-    const lno_t rowSize  = rowmap(i + 1) - rowStart;
-    auto* e              = entries.data() + rowStart;
-    auto* v              = values.data() + rowStart * blocksize;
-    bool done            = false;
-    while (!done) {
-      done = true;
-      for (lno_t j = 1; j < rowSize; ++j) {
-        const lno_t jp = j - 1;
-        if (e[jp] <= e[j]) continue;
-        Impl::kk_swap(e[jp], e[j]);
-        auto const vb  = v + j * blocksize;
-        auto const vbp = v + jp * blocksize;
-        for (lno_t k = 0; k < blocksize; ++k)  // std::swap_ranges(vb, vb + blocksize, vbp);
-          Impl::kk_swap(vb[k], vbp[k]);
-        done = false;
-      }
-    }
-  }
-};
-
-}  // namespace Impl
-
 // Sort a CRS matrix: within each row, sort entries ascending by column.
 // At the same time, permute the values.
 template <typename execution_space, typename rowmap_t, typename entries_t, typename values_t>
 void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, const entries_t& entries,
-                     const values_t& values) {
+                     const values_t& values,
+                     typename entries_t::non_const_value_type numCols =
+                         Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
   static_assert(Kokkos::SpaceAccessibility<execution_space, typename rowmap_t::memory_space>::accessible,
                 "sort_crs_matrix: rowmap_t is not accessible from the given execution "
                 "space");
@@ -338,71 +55,156 @@ void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, const
   static_assert(!std::is_const_v<typename entries_t::value_type>,
                 "sort_crs_matrix: entries_t must not be const-valued");
   static_assert(!std::is_const_v<typename values_t::value_type>, "sort_crs_matrix: value_t must not be const-valued");
-  using lno_t    = typename entries_t::non_const_value_type;
-  using team_pol = Kokkos::TeamPolicy<execution_space>;
-  bool useRadix  = !KokkosKernels::Impl::kk_is_gpu_exec_space<execution_space>();
-  lno_t numRows  = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
-  if (numRows == 0) return;
-  Impl::SortCrsMatrixFunctor<execution_space, rowmap_t, entries_t, values_t> funct(useRadix, rowmap, entries, values);
-  if (useRadix) {
-    Kokkos::parallel_for("sort_crs_matrix", Kokkos::RangePolicy<execution_space>(exec, 0, numRows), funct);
+  using Ordinal = typename entries_t::non_const_value_type;
+  // This early return condition covers having 0 or 1 entries,
+  // which is also implied by having 0 rows or 0 columns.
+  // If only 1 entry, the matrix is already sorted.
+  if (entries.extent(0) <= size_t(1)) {
+    return;
+  }
+  Ordinal numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
+  if constexpr (!KokkosKernels::Impl::kk_is_gpu_exec_space<execution_space>()) {
+    // On CPUs, use a sequential radix sort within each row.
+    Kokkos::parallel_for("sort_crs_matrix[CPU,radix]",
+                         Kokkos::RangePolicy<execution_space, Kokkos::Schedule<Kokkos::Dynamic>>(exec, 0, numRows),
+                         Impl::MatrixRadixSortFunctor<rowmap_t, entries_t, values_t>(rowmap, entries, values));
   } else {
-    // Try to get teamsize to be largest power of 2 not greater than avg entries
-    // per row
-    // TODO (probably important for performnce): add thread-level sort also, and
-    // use that for small avg degree. But this works for now.
-    lno_t idealTeamSize = 1;
-    lno_t avgDeg        = (entries.extent(0) + numRows - 1) / numRows;
-    while (idealTeamSize < avgDeg / 2) {
-      idealTeamSize *= 2;
+    // On GPUs:
+    //   If the matrix is highly imbalanced, or has long rows AND the dimensions
+    //   are not too large to do one large bulk sort, do that. Otherwise, sort
+    //   using one Kokkos thread per row.
+    Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows;
+#ifndef KK_DISABLE_BULK_SORT_BY_KEY
+    Ordinal maxDeg   = KokkosSparse::Impl::graph_max_degree(exec, rowmap);
+    bool useBulkSort = false;
+    if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) {
+      // Calculate the true number of columns if user didn't pass it in
+      if (numCols == Kokkos::ArithTraits<Ordinal>::max()) {
+        KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), entries, numCols);
+        numCols++;
+      }
+      uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols;
+      useBulkSort         = maxBulkKey / numRows == (uint64_t)numCols;
+    }
+    if (useBulkSort) {
+      auto permutation = KokkosSparse::Impl::computeEntryPermutation(exec, rowmap, entries, numCols);
+      // Permutations cannot be done in-place
+      Kokkos::View<typename values_t::value_type*, execution_space> origValues(
+          Kokkos::view_alloc(Kokkos::WithoutInitializing, "origValues"), values.extent(0));
+      Kokkos::View<typename entries_t::value_type*, execution_space> origEntries(
+          Kokkos::view_alloc(Kokkos::WithoutInitializing, "origEntries"), entries.extent(0));
+      Kokkos::deep_copy(exec, origValues, values);
+      Kokkos::deep_copy(exec, origEntries, entries);
+      KokkosSparse::Impl::applyPermutation(exec, permutation, origEntries, entries);
+      KokkosSparse::Impl::applyPermutation(exec, permutation, origValues, values);
+    } else
+#else
+    (void)numCols;
+#endif
+    {
+      using TeamPol = Kokkos::TeamPolicy<execution_space>;
+      // Can't use bulk sort approach as matrix dimensions are too large.
+      // Fall back to parallel thread-level sort within each row.
+      Ordinal vectorLength = 1;
+      while (vectorLength < avgDeg / 2) {
+        vectorLength *= 2;
+      }
+      if (vectorLength > TeamPol ::vector_length_max()) vectorLength = TeamPol ::vector_length_max();
+      Impl::MatrixThreadSortFunctor<TeamPol, Ordinal, rowmap_t, entries_t, values_t> funct(numRows, rowmap, entries,
+                                                                                           values);
+      Ordinal teamSize = TeamPol(exec, 1, 1, vectorLength).team_size_recommended(funct, Kokkos::ParallelForTag());
+      Kokkos::parallel_for("sort_crs_matrix[GPU,bitonic]",
+                           TeamPol(exec, (numRows + teamSize - 1) / teamSize, teamSize, vectorLength), funct);
     }
-    team_pol temp(exec, numRows, 1);
-    lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag());
-    lno_t teamSize    = std::min(idealTeamSize, maxTeamSize);
-    Kokkos::parallel_for("sort_crs_matrix", team_pol(exec, numRows, teamSize), funct);
   }
 }
 
 template <typename execution_space, typename rowmap_t, typename entries_t, typename values_t>
-void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values) {
-  sort_crs_matrix(execution_space(), rowmap, entries, values);
+void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values,
+                     typename entries_t::const_value_type numCols =
+                         Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
+  sort_crs_matrix(execution_space(), rowmap, entries, values, numCols);
 }
 
 template <typename rowmap_t, typename entries_t, typename values_t>
-void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values) {
-  sort_crs_matrix(typename entries_t::execution_space(), rowmap, entries, values);
+void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values,
+                     typename entries_t::const_value_type numCols =
+                         Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
+  sort_crs_matrix(typename entries_t::execution_space(), rowmap, entries, values, numCols);
 }
 
 template <typename crsMat_t>
 void sort_crs_matrix(const typename crsMat_t::execution_space& exec, const crsMat_t& A) {
-  sort_crs_matrix(exec, A.graph.row_map, A.graph.entries, A.values);
+  sort_crs_matrix(exec, A.graph.row_map, A.graph.entries, A.values, A.numCols());
 }
 
 template <typename crsMat_t>
 void sort_crs_matrix(const crsMat_t& A) {
-  sort_crs_matrix(typename crsMat_t::execution_space(), A.graph.row_map, A.graph.entries, A.values);
+  sort_crs_matrix(typename crsMat_t::execution_space(), A.graph.row_map, A.graph.entries, A.values, A.numCols());
 }
 
 // Sort a BRS matrix: within each row, sort entries ascending by column and
 // permute the values accordingly.
-template <typename execution_space, typename rowmap_t, typename entries_t, typename values_t, typename lno_t>
-void sort_bsr_matrix(const execution_space& exec, const lno_t blockdim, const rowmap_t& rowmap,
-                     const entries_t& entries, const values_t& values) {
-  // TODO: this is O(N^2) mock for debugging - do regular implementation based
-  // on Radix/Bitonic sort (like CSR) IDEA: maybe we need only one general
-  // Radix2/Bitonic2 and CSR sorting may call it with blockSize=1 ?
-  lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
-  if (numRows == 0) return;
-  const lno_t blocksize = blockdim * blockdim;
-
-  assert(values.extent(0) == entries.extent(0) * blocksize);
-  Impl::sort_bsr_functor<rowmap_t, entries_t, values_t> bsr_sorter(rowmap, entries, values, blocksize);
-  Kokkos::parallel_for("sort_bsr_matrix", Kokkos::RangePolicy<execution_space>(exec, 0, numRows), bsr_sorter);
-}
-
-template <typename execution_space, typename rowmap_t, typename entries_t, typename values_t, typename lno_t>
-void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, const entries_t& entries, const values_t& values) {
-  sort_bsr_matrix(execution_space(), blockdim, rowmap, entries, values);
+template <typename execution_space, typename rowmap_t, typename entries_t, typename values_t,
+          typename Ordinal = typename entries_t::non_const_value_type>
+void sort_bsr_matrix(const execution_space& exec, Ordinal blockSize, const rowmap_t& rowmap, const entries_t& entries,
+                     const values_t& values,
+                     typename entries_t::non_const_value_type numCols =
+                         Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
+  static_assert(std::is_same_v<Ordinal, typename entries_t::non_const_value_type>,
+                "sort_bsr_matrix: Ordinal type must match nonconst value type of "
+                "entries_t (default template parameter)");
+  if (entries.extent(0) <= size_t(1)) {
+    return;
+  }
+  Ordinal numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
+  if (numCols == Kokkos::ArithTraits<Ordinal>::max()) {
+    KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), entries, numCols);
+    numCols++;
+  }
+  uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols;
+  if (maxBulkKey / numRows != (uint64_t)numCols)
+    throw std::invalid_argument(
+        "sort_bsr_matrix: implementation requires that numRows * numCols is "
+        "representable in uint64_t");
+#ifdef KK_DISABLE_BULK_SORT_BY_KEY
+  using TeamPol = Kokkos::TeamPolicy<execution_space>;
+  using Offset  = typename rowmap_t::non_const_value_type;
+  // Temporary workaround: do not use Kokkos::Experimental::sort_by_key, instead
+  // sort bulk keys one row at a time
+  auto keys = Impl::generateBulkCrsKeys(exec, rowmap, entries, numCols);
+  Kokkos::View<Offset*, execution_space> permutation(Kokkos::view_alloc(Kokkos::WithoutInitializing, "permutation"),
+                                                     entries.extent(0));
+  KokkosKernels::Impl::sequential_fill(exec, permutation);
+  Ordinal vectorLength = 1;
+  Ordinal avgDeg       = (entries.extent(0) + numRows - 1) / numRows;
+  while (vectorLength < avgDeg / 2) {
+    vectorLength *= 2;
+  }
+  if (vectorLength > TeamPol ::vector_length_max()) vectorLength = TeamPol ::vector_length_max();
+  Impl::MatrixThreadSortFunctor<TeamPol, Ordinal, rowmap_t, decltype(keys), decltype(permutation)> funct(
+      numRows, rowmap, keys, permutation);
+  Ordinal teamSize = TeamPol(exec, 1, 1, vectorLength).team_size_recommended(funct, Kokkos::ParallelForTag());
+  Kokkos::parallel_for("sort_bulk_keys_by_row[GPU,bitonic]",
+                       TeamPol(exec, (numRows + teamSize - 1) / teamSize, teamSize, vectorLength), funct);
+#else
+  auto permutation = KokkosSparse::Impl::computeEntryPermutation(exec, rowmap, entries, numCols);
+#endif
+  // Permutations cannot be done in-place
+  Kokkos::View<typename values_t::value_type*, execution_space> origValues(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "origValues"), values.extent(0));
+  Kokkos::View<typename entries_t::value_type*, execution_space> origEntries(
+      Kokkos::view_alloc(Kokkos::WithoutInitializing, "origEntries"), entries.extent(0));
+  Kokkos::deep_copy(exec, origValues, values);
+  Kokkos::deep_copy(exec, origEntries, entries);
+  KokkosSparse::Impl::applyPermutation(exec, permutation, origEntries, entries);
+  KokkosSparse::Impl::applyPermutationBlockValues(exec, permutation, origValues, values, blockSize);
+}
+
+template <typename execution_space, typename rowmap_t, typename entries_t, typename values_t, typename Ordinal>
+void sort_bsr_matrix(Ordinal blockdim, const rowmap_t& rowmap, const entries_t& entries, const values_t& values,
+                     Ordinal numCols = Kokkos::ArithTraits<Ordinal>::max()) {
+  sort_bsr_matrix(execution_space(), blockdim, rowmap, entries, values, numCols);
 }
 
 // Sort a BSR matrix (like CRS but single values are replaced with contignous
@@ -413,7 +215,7 @@ void sort_bsr_matrix(const typename bsrMat_t::execution_space& exec, const bsrMa
   // directly
   sort_bsr_matrix<typename bsrMat_t::execution_space, typename bsrMat_t::row_map_type,
                   typename bsrMat_t::index_type::non_const_type, typename bsrMat_t::values_type::non_const_type>(
-      exec, A.blockDim(), A.graph.row_map, A.graph.entries, A.values);
+      exec, A.blockDim(), A.graph.row_map, A.graph.entries, A.values, A.numCols());
 }
 
 template <typename bsrMat_t>
@@ -423,9 +225,10 @@ void sort_bsr_matrix(const bsrMat_t& A) {
 
 // Sort a CRS graph: within each row, sort entries ascending by column.
 template <typename execution_space, typename rowmap_t, typename entries_t>
-void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const entries_t& entries) {
-  using lno_t    = typename entries_t::non_const_value_type;
-  using team_pol = Kokkos::TeamPolicy<execution_space>;
+void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const entries_t& entries,
+                    typename entries_t::non_const_value_type numCols =
+                        Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
+  using Ordinal = typename entries_t::non_const_value_type;
   static_assert(Kokkos::SpaceAccessibility<execution_space, typename rowmap_t::memory_space>::accessible,
                 "sort_crs_graph: rowmap_t is not accessible from the given execution "
                 "space");
@@ -433,27 +236,55 @@ void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const e
                 "sort_crs_graph: entries_t is not accessible from the given execution "
                 "space");
   static_assert(!std::is_const_v<typename entries_t::value_type>, "sort_crs_graph: entries_t must not be const-valued");
-  bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space<execution_space>();
-  lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
-  if (numRows == 0) return;
-  Impl::SortCrsGraphFunctor<execution_space, rowmap_t, entries_t> funct(useRadix, rowmap, entries);
-  if (useRadix) {
-    Kokkos::parallel_for("sort_crs_graph", Kokkos::RangePolicy<execution_space>(exec, 0, numRows), funct);
+  Ordinal numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
+  if (entries.extent(0) <= size_t(1)) {
+    return;
+  }
+  if constexpr (!KokkosKernels::Impl::kk_is_gpu_exec_space<execution_space>()) {
+    // If on CPU, sort each row independently. Don't need to know numCols for
+    // this.
+    Kokkos::parallel_for("sort_crs_graph[CPU,radix]",
+                         Kokkos::RangePolicy<execution_space, Kokkos::Schedule<Kokkos::Dynamic>>(exec, 0, numRows),
+                         Impl::GraphRadixSortFunctor<rowmap_t, entries_t>(rowmap, entries));
   } else {
-    // Try to get teamsize to be largest power of 2 less than or equal to
-    // half the entries per row. 0.5 * #entries is bitonic's parallelism within
-    // a row.
-    // TODO (probably important for performnce): add thread-level sort also, and
-    // use that for small avg degree. But this works for now.
-    lno_t idealTeamSize = 1;
-    lno_t avgDeg        = (entries.extent(0) + numRows - 1) / numRows;
-    while (idealTeamSize < avgDeg / 2) {
-      idealTeamSize *= 2;
+    // On GPUs:
+    //   If the graph is highly imbalanced AND the dimensions are not too large
+    //   to do one large bulk sort, do that. Otherwise, sort using one Kokkos
+    //   thread per row.
+    Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows;
+#ifndef KK_DISABLE_BULK_SORT_BY_KEY
+    Ordinal maxDeg   = KokkosSparse::Impl::graph_max_degree(exec, rowmap);
+    bool useBulkSort = false;
+    if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) {
+      // Calculate the true number of columns if user didn't pass it in
+      if (numCols == Kokkos::ArithTraits<Ordinal>::max()) {
+        KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), entries, numCols);
+        numCols++;
+      }
+      uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols;
+      useBulkSort         = maxBulkKey / numRows == (uint64_t)numCols;
+    }
+    if (useBulkSort) {
+      auto keys = KokkosSparse::Impl::generateBulkCrsKeys(exec, rowmap, entries, numCols);
+      Kokkos::Experimental::sort_by_key(exec, keys, entries);
+    } else
+#else
+    (void)numCols;
+#endif
+    {
+      using TeamPol = Kokkos::TeamPolicy<execution_space>;
+      // Fall back to thread-level sort within each row
+      Ordinal vectorLength = 1;
+      while (vectorLength < avgDeg / 2) {
+        vectorLength *= 2;
+      }
+      if (vectorLength > TeamPol ::vector_length_max()) vectorLength = TeamPol ::vector_length_max();
+
+      Impl::GraphThreadSortFunctor<TeamPol, Ordinal, rowmap_t, entries_t> funct(numRows, rowmap, entries);
+      Ordinal teamSize = TeamPol(exec, 1, 1, vectorLength).team_size_recommended(funct, Kokkos::ParallelForTag());
+      Kokkos::parallel_for("sort_crs_graph[GPU,bitonic]",
+                           TeamPol(exec, (numRows + teamSize - 1) / teamSize, teamSize, vectorLength), funct);
     }
-    team_pol temp(exec, numRows, 1);
-    lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag());
-    lno_t teamSize    = std::min(idealTeamSize, maxTeamSize);
-    Kokkos::parallel_for("sort_crs_graph", team_pol(exec, numRows, teamSize), funct);
   }
 }
 
@@ -462,36 +293,38 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) {
   sort_crs_graph(execution_space(), rowmap, entries);
 }
 
-// This overload covers 2 cases, while allowing all template args to be deduced:
-//  - sort_crs_graph(exec, G)
-//  - sort_crs_graph(rowmap, entries)
-template <typename Arg1, typename Arg2>
-void sort_crs_graph(const Arg1& a1, const Arg2& a2) {
-  if constexpr (Kokkos::is_execution_space_v<Arg1>) {
-    // a1 is an exec instance, a2 is a graph
-    sort_crs_graph(a1, a2.row_map, a2.entries);
-  } else if constexpr (Kokkos::is_view_v<Arg1>) {
-    // a1 is rowmap, a2 is entries
-    sort_crs_graph(typename Arg2::execution_space(), a1, a2);
-  } else {
-    static_assert(Arg1::doesnthavethisthing,
-                  "sort_crs_graph(arg1, arg2): expect either (exec, G) or "
-                  "(rowmap, entries)");
-  }
+template <typename rowmap_t, typename entries_t>
+typename std::enable_if_t<Kokkos::is_view_v<rowmap_t>> sort_crs_graph(
+    const rowmap_t& rowmap, const entries_t& entries,
+    typename entries_t::const_value_type& numCols =
+        Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
+  sort_crs_graph(typename entries_t::execution_space(), rowmap, entries, numCols);
+}
+
+template <typename execution_space, typename crsGraph_t>
+typename std::enable_if_t<Kokkos::is_execution_space_v<execution_space>> sort_crs_graph(
+    const execution_space& exec, const crsGraph_t& G,
+    typename crsGraph_t::entries_type::const_value_type& numCols =
+        Kokkos::ArithTraits<typename crsGraph_t::entries_type::non_const_value_type>::max()) {
+  sort_crs_graph(exec, G.row_map, G.entries, numCols);
 }
 
 template <typename crsGraph_t>
-void sort_crs_graph(const crsGraph_t& G) {
-  sort_crs_graph(typename crsGraph_t::execution_space(), G);
+void sort_crs_graph(const crsGraph_t& G,
+                    typename crsGraph_t::entries_type::const_value_type& numCols =
+                        Kokkos::ArithTraits<typename crsGraph_t::entries_type::non_const_value_type>::max()) {
+  sort_crs_graph(typename crsGraph_t::execution_space(), G, numCols);
 }
 
 template <typename exec_space, typename rowmap_t, typename entries_t, typename values_t>
 void sort_and_merge_matrix(const exec_space& exec, const typename rowmap_t::const_type& rowmap_in,
                            const entries_t& entries_in, const values_t& values_in, rowmap_t& rowmap_out,
-                           entries_t& entries_out, values_t& values_out) {
+                           entries_t& entries_out, values_t& values_out,
+                           typename entries_t::const_value_type& numCols =
+                               Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
   using nc_rowmap_t = typename rowmap_t::non_const_type;
-  using size_type   = typename nc_rowmap_t::value_type;
-  using ordinal_t   = typename entries_t::value_type;
+  using Offset      = typename nc_rowmap_t::value_type;
+  using Ordinal     = typename entries_t::value_type;
   using range_t     = Kokkos::RangePolicy<exec_space>;
   static_assert(Kokkos::SpaceAccessibility<exec_space, typename rowmap_t::memory_space>::accessible,
                 "sort_and_merge_matrix: rowmap_t is not accessible from the given "
@@ -507,8 +340,8 @@ void sort_and_merge_matrix(const exec_space& exec, const typename rowmap_t::cons
   static_assert(!std::is_const_v<typename values_t::value_type>,
                 "sort_and_merge_matrix: value_t must not be const-valued");
 
-  ordinal_t numRows = rowmap_in.extent(0) ? ordinal_t(rowmap_in.extent(0) - 1) : ordinal_t(0);
-  size_type nnz     = entries_in.extent(0);
+  Ordinal numRows = rowmap_in.extent(0) ? Ordinal(rowmap_in.extent(0) - 1) : Ordinal(0);
+  Offset nnz      = entries_in.extent(0);
 
   if (numRows == 0) {
     rowmap_out  = typename rowmap_t::non_const_type("SortedMerged rowmap", rowmap_in.extent(0));
@@ -517,13 +350,13 @@ void sort_and_merge_matrix(const exec_space& exec, const typename rowmap_t::cons
     return;
   }
 
-  sort_crs_matrix(exec, rowmap_in, entries_in, values_in);
+  sort_crs_matrix(exec, rowmap_in, entries_in, values_in, numCols);
 
   // Count entries per row into a new rowmap, in terms of merges that can be
   // done
   nc_rowmap_t nc_rowmap_out(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "SortedMerged rowmap"), numRows + 1);
-  size_type numCompressedEntries = 0;
-  Kokkos::parallel_reduce(range_t(exec, 0, numRows),
+  Offset numCompressedEntries = 0;
+  Kokkos::parallel_reduce("KokkosSparse::Impl::MergedRowmapFunctor", range_t(exec, 0, numRows),
                           Impl::MergedRowmapFunctor<nc_rowmap_t, entries_t>(nc_rowmap_out, rowmap_in, entries_in),
                           numCompressedEntries);
   if (nnz == numCompressedEntries) {
@@ -555,7 +388,7 @@ void sort_and_merge_matrix(const exec_space& exec, const typename rowmap_t::cons
   values_out =
       values_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "SortedMerged values"), numCompressedEntries);
   // Compute merged entries and values
-  Kokkos::parallel_for(range_t(exec, 0, numRows),
+  Kokkos::parallel_for("KokkosSparse::Impl::MatrixMergedEntriesFunctor", range_t(exec, 0, numRows),
                        Impl::MatrixMergedEntriesFunctor<rowmap_t, entries_t, values_t>(
                            rowmap_orig, entries_orig, values_orig, rowmap_out, entries_out, values_out));
 }
@@ -571,7 +404,8 @@ crsMat_t sort_and_merge_matrix(const typename crsMat_t::execution_space& exec, c
   entries_t entries_out;
   values_t values_out;
 
-  sort_and_merge_matrix(exec, A.graph.row_map, A.graph.entries, A.values, rowmap_out, entries_out, values_out);
+  sort_and_merge_matrix(exec, A.graph.row_map, A.graph.entries, A.values, rowmap_out, entries_out, values_out,
+                        A.numCols());
 
   return crsMat_t("SortedMerged", A.numRows(), A.numCols(), values_out.extent(0), values_out, rowmap_out, entries_out);
 }
@@ -584,23 +418,29 @@ crsMat_t sort_and_merge_matrix(const crsMat_t& A) {
 template <typename exec_space, typename rowmap_t, typename entries_t, typename values_t>
 void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in,
                            const values_t& values_in, rowmap_t& rowmap_out, entries_t& entries_out,
-                           values_t& values_out) {
-  sort_and_merge_matrix(exec_space(), rowmap_in, entries_in, values_in, rowmap_out, entries_out, values_out);
+                           values_t& values_out,
+                           typename entries_t::const_value_type& numCols =
+                               Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
+  sort_and_merge_matrix(exec_space(), rowmap_in, entries_in, values_in, rowmap_out, entries_out, values_out, numCols);
 }
 
 template <typename rowmap_t, typename entries_t, typename values_t>
 void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in,
                            const values_t& values_in, rowmap_t& rowmap_out, entries_t& entries_out,
-                           values_t& values_out) {
+                           values_t& values_out,
+                           typename entries_t::const_value_type& numCols =
+                               Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
   sort_and_merge_matrix(typename entries_t::execution_space(), rowmap_in, entries_in, values_in, rowmap_out,
-                        entries_out, values_out);
+                        entries_out, values_out, numCols);
 }
 
 template <typename exec_space, typename rowmap_t, typename entries_t>
 void sort_and_merge_graph(const exec_space& exec, const typename rowmap_t::const_type& rowmap_in,
-                          const entries_t& entries_in, rowmap_t& rowmap_out, entries_t& entries_out) {
-  using size_type   = typename rowmap_t::non_const_value_type;
-  using lno_t       = typename entries_t::value_type;
+                          const entries_t& entries_in, rowmap_t& rowmap_out, entries_t& entries_out,
+                          typename entries_t::const_value_type& numCols =
+                              Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
+  using Offset      = typename rowmap_t::non_const_value_type;
+  using Ordinal     = typename entries_t::value_type;
   using range_t     = Kokkos::RangePolicy<exec_space>;
   using nc_rowmap_t = typename rowmap_t::non_const_type;
   static_assert(Kokkos::SpaceAccessibility<exec_space, typename rowmap_t::memory_space>::accessible,
@@ -612,19 +452,19 @@ void sort_and_merge_graph(const exec_space& exec, const typename rowmap_t::const
   static_assert(!std::is_const_v<typename entries_t::value_type>,
                 "sort_and_merge_graph: entries_t must not be const-valued");
 
-  lno_t numRows = rowmap_in.extent(0) ? rowmap_in.extent(0) - 1 : 0;
+  Ordinal numRows = rowmap_in.extent(0) ? rowmap_in.extent(0) - 1 : 0;
   if (numRows == 0) {
     rowmap_out  = typename rowmap_t::non_const_type("SortedMerged rowmap", rowmap_in.extent(0));
     entries_out = entries_t();
     return;
   }
   // Sort in place
-  sort_crs_graph(exec, rowmap_in, entries_in);
+  sort_crs_graph(exec, rowmap_in, entries_in, numCols);
   // Count entries per row into a new rowmap, in terms of merges that can be
   // done
   nc_rowmap_t nc_rowmap_out(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "SortedMerged rowmap"), numRows + 1);
-  size_type numCompressedEntries = 0;
-  Kokkos::parallel_reduce(range_t(exec, 0, numRows),
+  Offset numCompressedEntries = 0;
+  Kokkos::parallel_reduce("KokkosSparse::Impl::MergedRowmapFunctor", range_t(exec, 0, numRows),
                           Impl::MergedRowmapFunctor<rowmap_t, entries_t>(nc_rowmap_out, rowmap_in, entries_in),
                           numCompressedEntries);
   if (entries_in.extent(0) == size_t(numCompressedEntries)) {
@@ -655,107 +495,50 @@ void sort_and_merge_graph(const exec_space& exec, const typename rowmap_t::const
   entries_out =
       entries_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "SortedMerged entries"), numCompressedEntries);
   // Compute merged entries and values
-  Kokkos::parallel_for(range_t(exec, 0, numRows), Impl::GraphMergedEntriesFunctor<rowmap_t, entries_t>(
-                                                      rowmap_orig, entries_orig, rowmap_out, entries_out));
+  Kokkos::parallel_for(
+      "KokkosSparse::Impl::GraphMergedEntriesFunctor", range_t(exec, 0, numRows),
+      Impl::GraphMergedEntriesFunctor<rowmap_t, entries_t>(rowmap_orig, entries_orig, rowmap_out, entries_out));
 }
 
 template <typename exec_space, typename rowmap_t, typename entries_t>
 void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in,
-                          rowmap_t& rowmap_out, entries_t& entries_out) {
-  return sort_and_merge_graph(exec_space(), rowmap_in, entries_in, rowmap_out, entries_out);
+                          rowmap_t& rowmap_out, entries_t& entries_out,
+                          typename entries_t::const_value_type& numCols =
+                              Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
+  return sort_and_merge_graph(exec_space(), rowmap_in, entries_in, rowmap_out, entries_out, numCols);
 }
 
 template <typename rowmap_t, typename entries_t>
 void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in,
-                          rowmap_t& rowmap_out, entries_t& entries_out) {
-  return sort_and_merge_graph(typename entries_t::execution_space(), rowmap_in, entries_in, rowmap_out, entries_out);
+                          rowmap_t& rowmap_out, entries_t& entries_out,
+                          typename entries_t::const_value_type& numCols =
+                              Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
+  return sort_and_merge_graph(typename entries_t::execution_space(), rowmap_in, entries_in, rowmap_out, entries_out,
+                              numCols);
 }
 
 template <typename crsGraph_t>
-crsGraph_t sort_and_merge_graph(const typename crsGraph_t::execution_space& exec, const crsGraph_t& G) {
+crsGraph_t sort_and_merge_graph(
+    const typename crsGraph_t::execution_space& exec, const crsGraph_t& G,
+    typename crsGraph_t::entries_type::const_value_type& numCols =
+        Kokkos::ArithTraits<typename crsGraph_t::entries_type::non_const_value_type>::max()) {
   using rowmap_t  = typename crsGraph_t::row_map_type::non_const_type;
   using entries_t = typename crsGraph_t::entries_type;
   static_assert(!std::is_const<typename entries_t::value_type>::value,
                 "sort_and_merge_graph requires StaticCrsGraph entries to be non-const.");
   rowmap_t mergedRowmap;
   entries_t mergedEntries;
-  sort_and_merge_graph(exec, G.row_map, G.entries, mergedRowmap, mergedEntries);
+  sort_and_merge_graph(exec, G.row_map, G.entries, mergedRowmap, mergedEntries, numCols);
   return crsGraph_t(mergedEntries, mergedRowmap);
 }
 
 template <typename crsGraph_t>
-crsGraph_t sort_and_merge_graph(const crsGraph_t& G) {
-  return sort_and_merge_graph(typename crsGraph_t::execution_space(), G);
+crsGraph_t sort_and_merge_graph(
+    const crsGraph_t& G, typename crsGraph_t::entries_type::const_value_type& numCols =
+                             Kokkos::ArithTraits<typename crsGraph_t::entries_type::non_const_value_type>::max()) {
+  return sort_and_merge_graph(typename crsGraph_t::execution_space(), G, numCols);
 }
 
 }  // namespace KokkosSparse
 
-namespace KokkosKernels {
-
-// ----------------------------------
-// BSR matrix/graph sorting utilities
-// ----------------------------------
-
-// Sort a BRS matrix: within each row, sort entries ascending by column and
-// permute the values accordingly.
-template <typename execution_space, typename rowmap_t, typename entries_t, typename values_t,
-          typename lno_t = typename entries_t::non_const_value_type>
-[[deprecated]] void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, const entries_t& entries,
-                                    const values_t& values) {
-  KokkosSparse::sort_bsr_matrix(blockdim, rowmap, entries, values);
-}
-
-template <typename bsrMat_t>
-[[deprecated]] void sort_bsr_matrix(const bsrMat_t& A) {
-  KokkosSparse::sort_bsr_matrix(A);
-}
-
-// ----------------------------------
-// CRS matrix/graph sorting utilities
-// ----------------------------------
-
-// The sort_crs* functions sort the adjacent column list for each row into
-// ascending order.
-
-template <typename execution_space, typename rowmap_t, typename entries_t, typename values_t>
-[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values) {
-  KokkosSparse::sort_crs_matrix<execution_space, rowmap_t, entries_t>(rowmap, entries, values);
-}
-
-template <typename crsMat_t>
-[[deprecated]] void sort_crs_matrix(const crsMat_t& A) {
-  KokkosSparse::sort_crs_matrix(A);
-}
-
-template <typename execution_space, typename rowmap_t, typename entries_t>
-[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) {
-  KokkosSparse::sort_crs_graph<execution_space, rowmap_t, entries_t>(rowmap, entries);
-}
-
-template <typename crsGraph_t>
-[[deprecated]] void sort_crs_graph(const crsGraph_t& G) {
-  KokkosSparse::sort_crs_graph(G);
-}
-
-// sort_and_merge_matrix produces a new matrix which is equivalent to A but is
-// sorted and has no duplicated entries: each (i, j) is unique. Values for
-// duplicated entries are summed.
-template <typename crsMat_t>
-[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) {
-  KokkosSparse::sort_and_merge_matrix(A);
-}
-
-template <typename crsGraph_t>
-[[deprecated]] crsGraph_t sort_and_merge_graph(const crsGraph_t& G) {
-  KokkosSparse::sort_and_merge_graph(G);
-}
-
-template <typename exec_space, typename rowmap_t, typename entries_t>
-[[deprecated]] void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in,
-                                         rowmap_t& rowmap_out, entries_t& entries_out) {
-  KokkosSparse::sort_and_merge_graph(rowmap_in, entries_in, rowmap_out, entries_out);
-}
-
-}  // namespace KokkosKernels
-
 #endif  // _KOKKOSSPARSE_SORTCRS_HPP
diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp
index 781857ef551f..d73787481e0e 100644
--- a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp
+++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp
@@ -848,6 +848,19 @@ ordinal_t graph_max_degree(const rowmap_t &rowmap) {
   return val;
 }
 
+template <typename execution_space, typename rowmap_t>
+typename rowmap_t::non_const_value_type graph_max_degree(const execution_space &exec, const rowmap_t &rowmap) {
+  using Offset  = typename rowmap_t::non_const_value_type;
+  using Reducer = Kokkos::Max<Offset>;
+  Offset nrows  = rowmap.extent(0);
+  if (nrows) nrows--;
+  if (nrows == 0) return 0;
+  Offset val;
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<execution_space>(exec, 0, nrows),
+                          MaxDegreeFunctor<Reducer, Offset, rowmap_t>(rowmap), Reducer(val));
+  return val;
+}
+
 template <typename device_t, typename ordinal_t, typename rowmap_t>
 void graph_min_max_degree(const rowmap_t &rowmap, ordinal_t &min_degree, ordinal_t &max_degree) {
   using Reducer   = Kokkos::MinMax<ordinal_t>;
diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_spadd_handle.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_spadd_handle.hpp
index ea9594ca3e2f..8d28309585a7 100644
--- a/packages/kokkos-kernels/sparse/src/KokkosSparse_spadd_handle.hpp
+++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_spadd_handle.hpp
@@ -102,10 +102,6 @@ class SPADDHandle {
    */
   size_type get_c_nnz() { return this->result_nnz_size; }
 
-  void set_sort_option(int option) { this->sort_option = option; }
-
-  int get_sort_option() { return this->sort_option; }
-
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
   SpaddCusparseData cusparseData;
 #endif
diff --git a/packages/muelu/cmake/MueLu_config.hpp.in b/packages/muelu/cmake/MueLu_config.hpp.in
index c120b66affd4..9f02daefb9b7 100644
--- a/packages/muelu/cmake/MueLu_config.hpp.in
+++ b/packages/muelu/cmake/MueLu_config.hpp.in
@@ -124,4 +124,6 @@
 */
 @MUELU_DEPRECATED_DECLARATIONS@
 
+#cmakedefine MueLu_SHOW_DEPRECATED_WARNINGS
+
 #endif /* MUELU_CONFIG_HPP */
diff --git a/packages/muelu/src/Interface/MueLu_MLParameterListInterpreter_decl.hpp b/packages/muelu/src/Interface/MueLu_MLParameterListInterpreter_decl.hpp
index b1cc80c1181d..8b22c8fc7f90 100644
--- a/packages/muelu/src/Interface/MueLu_MLParameterListInterpreter_decl.hpp
+++ b/packages/muelu/src/Interface/MueLu_MLParameterListInterpreter_decl.hpp
@@ -49,7 +49,7 @@
 #endif
 
 #ifdef HAVE_MUELU_DEPRECATED_CODE
-#ifndef TRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS
+#ifdef MueLu_SHOW_DEPRECATED_WARNINGS
 #warning "The header file MueLu_MLParameterListInterpreter.hpp is deprecated"
 #endif
 #else
diff --git a/packages/phalanx/test/DagManager/DagManagerTest.cpp b/packages/phalanx/test/DagManager/DagManagerTest.cpp
index 34b9dc36caca..ffee37324306 100644
--- a/packages/phalanx/test/DagManager/DagManagerTest.cpp
+++ b/packages/phalanx/test/DagManager/DagManagerTest.cpp
@@ -50,8 +50,8 @@ void registerDagNodes(PHX::DagManager<PHX::MyTraits>& em,
     RCP<Mock> a = rcp(new Mock);
     a->setName("Eval_A");
     a->evaluates("A");
-    a->requires("B");
-    a->requires("C");
+    a->depends("B");
+    a->depends("C");
     em.registerEvaluator(a);
   }
 
@@ -60,7 +60,7 @@ void registerDagNodes(PHX::DagManager<PHX::MyTraits>& em,
     b->setName("Eval_B");
     b->evaluates("B");
     b->evaluates("D");
-    b->requires("E");
+    b->depends("E");
     em.registerEvaluator(b);
   }
 
@@ -68,7 +68,7 @@ void registerDagNodes(PHX::DagManager<PHX::MyTraits>& em,
     RCP<Mock> c = rcp(new Mock);
     c->setName("Eval_C");
     c->evaluates("C");
-    c->requires("E");
+    c->depends("E");
     em.registerEvaluator(c);
   }
 
@@ -77,7 +77,7 @@ void registerDagNodes(PHX::DagManager<PHX::MyTraits>& em,
     e->setName("Eval_E");
     e->evaluates("E");
     if (addCircularDependency)
-      e->requires("D");
+      e->depends("D");
     em.registerEvaluator(e);
   }
 
@@ -86,7 +86,7 @@ void registerDagNodes(PHX::DagManager<PHX::MyTraits>& em,
     RCP<Mock> c = rcp(new Mock);
     c->setName("DUPLICATE Eval_C");
     c->evaluates("C");
-    c->requires("E");
+    c->depends("E");
     em.registerEvaluator(c);
   }
 }
@@ -342,22 +342,22 @@ TEUCHOS_UNIT_TEST(dag, analyze_graph2)
     RCP<Mock> m = rcp(new Mock);
     m->setName("Eval_A");
     m->evaluates("A");
-    m->requires("B");
-    m->requires("C");
+    m->depends("B");
+    m->depends("C");
     dag.registerEvaluator(m);
   }
   {
     RCP<Mock> m = rcp(new Mock);
     m->setName("Eval_B");
     m->evaluates("B");
-    m->requires("D");
+    m->depends("D");
     dag.registerEvaluator(m);
   }
   {
     RCP<Mock> m = rcp(new Mock);
     m->setName("Eval_C");
     m->evaluates("C");
-    m->requires("D");
+    m->depends("D");
     dag.registerEvaluator(m);
   }
   {
@@ -469,8 +469,8 @@ TEUCHOS_UNIT_TEST(dag, contrib_and_eval_B)
     RCP<Mock> m = rcp(new Mock);
     m->setName("Eval_A");
     m->evaluates("A");
-    m->requires("B");
-    m->requires("C");
+    m->depends("B");
+    m->depends("C");
     dag.registerEvaluator(m);
   }
   {
@@ -483,7 +483,7 @@ TEUCHOS_UNIT_TEST(dag, contrib_and_eval_B)
     RCP<Mock> m = rcp(new Mock);
     m->setName("Eval_C");
     m->evaluates("C");
-    m->requires("D");
+    m->depends("D");
     dag.registerEvaluator(m);
   }
   {
@@ -496,14 +496,14 @@ TEUCHOS_UNIT_TEST(dag, contrib_and_eval_B)
     RCP<Mock> m = rcp(new Mock);
     m->setName("Eval_B+");
     m->contributes("B");
-    m->requires("D");
+    m->depends("D");
     dag.registerEvaluator(m);
   }
   { // Contributes to B also
     RCP<Mock> m = rcp(new Mock);
     m->setName("Eval_B++");
     m->contributes("B");
-    m->requires("D");
+    m->depends("D");
     dag.registerEvaluator(m);
   }
 
@@ -572,15 +572,15 @@ TEUCHOS_UNIT_TEST(dag, contrib_only_B)
     RCP<Mock> m = rcp(new Mock);
     m->setName("Eval_A");
     m->evaluates("A");
-    m->requires("B");
-    m->requires("C");
+    m->depends("B");
+    m->depends("C");
     dag.registerEvaluator(m);
   }
   {
     RCP<Mock> m = rcp(new Mock);
     m->setName("Eval_C");
     m->evaluates("C");
-    m->requires("D");
+    m->depends("D");
     dag.registerEvaluator(m);
   }
   {
@@ -593,14 +593,14 @@ TEUCHOS_UNIT_TEST(dag, contrib_only_B)
     RCP<Mock> m = rcp(new Mock);
     m->setName("Eval_B+");
     m->contributes("B");
-    m->requires("D");
+    m->depends("D");
     dag.registerEvaluator(m);
   }
   { // Contributes to B also
     RCP<Mock> m = rcp(new Mock);
     m->setName("Eval_B++");
     m->contributes("B");
-    m->requires("D");
+    m->depends("D");
     dag.registerEvaluator(m);
   }
 
@@ -665,7 +665,7 @@ TEUCHOS_UNIT_TEST(dag, alias_field)
     RCP<Mock> m = rcp(new Mock);
     m->setName("Eval_A");
     m->evaluates("A");
-    m->requires("B");
+    m->depends("B");
     dag.registerEvaluator(m);
   }
   {
@@ -746,14 +746,14 @@ TEUCHOS_UNIT_TEST(dag, use_range_and_unshared)
     RCP<Mock> e = rcp(new Mock);
     e->setName("c");
     e->evaluates("f3");
-    e->requires("f2");
+    e->depends("f2");
     dag.registerEvaluator(e);
   }
   {
     RCP<Mock> e = rcp(new Mock);
     e->setName("e");
     e->evaluates("f4");
-    e->requires("f3");
+    e->depends("f3");
     dag.registerEvaluator(e);
   }
   {
@@ -766,7 +766,7 @@ TEUCHOS_UNIT_TEST(dag, use_range_and_unshared)
     RCP<Mock> e = rcp(new Mock);
     e->setName("b");
     e->evaluates("f2");
-    e->requires("f1");
+    e->depends("f1");
     e->unshared("f2");
     e->unshared("f1");
     dag.registerEvaluator(e);
@@ -997,7 +997,7 @@ TEUCHOS_UNIT_TEST(contrib, basic_contrib_only)
     RCP<Mock> e = rcp(new Mock);
     e->setName("Convection Operator");
     e->contributes("Residual",use_dynamic_layout);
-    e->requires("X",use_dynamic_layout);
+    e->depends("X",use_dynamic_layout);
     dm.registerEvaluator(e);
   }
 
@@ -1005,7 +1005,7 @@ TEUCHOS_UNIT_TEST(contrib, basic_contrib_only)
     RCP<Mock> e = rcp(new Mock);
     e->setName("Diffusion Operator");
     e->contributes("Residual",use_dynamic_layout);
-    e->requires("X",use_dynamic_layout);
+    e->depends("X",use_dynamic_layout);
     dm.registerEvaluator(e);
   }
 
@@ -1013,7 +1013,7 @@ TEUCHOS_UNIT_TEST(contrib, basic_contrib_only)
     RCP<Mock> e = rcp(new Mock);
     e->setName("Reaction Operator");
     e->contributes("Residual",use_dynamic_layout);
-    e->requires("X",use_dynamic_layout);
+    e->depends("X",use_dynamic_layout);
     dm.registerEvaluator(e);
   }
 
@@ -1023,7 +1023,7 @@ TEUCHOS_UNIT_TEST(contrib, basic_contrib_only)
     // Important that this is "contributes" to catch writing graph
     // output correctly.
     e->contributes("Scatter",use_dynamic_layout);
-    e->requires("Residual",use_dynamic_layout);
+    e->depends("Residual",use_dynamic_layout);
     dm.registerEvaluator(e);
   }
 
@@ -1086,7 +1086,7 @@ TEUCHOS_UNIT_TEST(contrib, basic_contrib_and_evalauted)
     RCP<Mock> e = rcp(new Mock);
     e->setName("Initialize");
     e->evaluates("Residual",use_dynamic_layout);
-    e->requires("X",use_dynamic_layout);
+    e->depends("X",use_dynamic_layout);
     dm.registerEvaluator(e);
   }
 
@@ -1094,7 +1094,7 @@ TEUCHOS_UNIT_TEST(contrib, basic_contrib_and_evalauted)
     RCP<Mock> e = rcp(new Mock);
     e->setName("Convection Operator");
     e->contributes("Residual",use_dynamic_layout);
-    e->requires("X",use_dynamic_layout);
+    e->depends("X",use_dynamic_layout);
     dm.registerEvaluator(e);
   }
 
@@ -1102,7 +1102,7 @@ TEUCHOS_UNIT_TEST(contrib, basic_contrib_and_evalauted)
     RCP<Mock> e = rcp(new Mock);
     e->setName("Diffusion Operator");
     e->contributes("Residual",use_dynamic_layout);
-    e->requires("X",use_dynamic_layout);
+    e->depends("X",use_dynamic_layout);
     dm.registerEvaluator(e);
   }
 
@@ -1110,7 +1110,7 @@ TEUCHOS_UNIT_TEST(contrib, basic_contrib_and_evalauted)
     RCP<Mock> e = rcp(new Mock);
     e->setName("Reaction Operator");
     e->contributes("Residual",use_dynamic_layout);
-    e->requires("X",use_dynamic_layout);
+    e->depends("X",use_dynamic_layout);
     dm.registerEvaluator(e);
   }
 
@@ -1120,7 +1120,7 @@ TEUCHOS_UNIT_TEST(contrib, basic_contrib_and_evalauted)
     // Important that this is "contributes" to catch writing graph
     // output correctly.
     e->contributes("Scatter",use_dynamic_layout);
-    e->requires("Residual",use_dynamic_layout);
+    e->depends("Residual",use_dynamic_layout);
     dm.registerEvaluator(e);
   }
 
diff --git a/packages/phalanx/test/EvaluatorMacros/EvaluatorMacrosTest.cpp b/packages/phalanx/test/EvaluatorMacros/EvaluatorMacrosTest.cpp
index 57081eb2fe88..abb938d199e8 100644
--- a/packages/phalanx/test/EvaluatorMacros/EvaluatorMacrosTest.cpp
+++ b/packages/phalanx/test/EvaluatorMacros/EvaluatorMacrosTest.cpp
@@ -46,8 +46,8 @@ TEUCHOS_UNIT_TEST(evaluator_macros, basic)
     RCP<Ev1> a = rcp(new Ev1(*plist_a));
     a->setName("Eval_A");
     a->evaluates("A");
-    a->requires("B");
-    a->requires("C");
+    a->depends("B");
+    a->depends("C");
     fm.registerEvaluator<MyTraits::Residual>(a);
   }
   {
@@ -55,7 +55,7 @@ TEUCHOS_UNIT_TEST(evaluator_macros, basic)
     RCP<Ev2> b = rcp(new Ev2(*plist_b));
     b->setName("Eval_B");
     b->evaluates("B");
-    b->requires("D");
+    b->depends("D");
     fm.registerEvaluator<MyTraits::Residual>(b);
   }
   {
@@ -63,7 +63,7 @@ TEUCHOS_UNIT_TEST(evaluator_macros, basic)
     RCP<Ev2> c = rcp(new Ev2(*plist_c));
     c->setName("Eval_C");
     c->evaluates("C");
-    c->requires("D");
+    c->depends("D");
     fm.registerEvaluator<MyTraits::Residual>(c);
   }
   {
diff --git a/packages/phalanx/test/EvaluatorMacros/EvaluatorWithMacros.hpp b/packages/phalanx/test/EvaluatorMacros/EvaluatorWithMacros.hpp
index 13badbfcdb06..de1a86677e2b 100644
--- a/packages/phalanx/test/EvaluatorMacros/EvaluatorWithMacros.hpp
+++ b/packages/phalanx/test/EvaluatorMacros/EvaluatorWithMacros.hpp
@@ -19,7 +19,7 @@ namespace PHX {
   PHX_EVALUATOR_CLASS(EvaluatorWithMacros1)
   public:
     void evaluates(const std::string& field_name);
-    void requires(const std::string& field_name);
+    void depends(const std::string& field_name);
     void bindField(const PHX::FieldTag& ft, const std::any& f);
   PHX_EVALUATOR_CLASS_END
   
@@ -27,7 +27,7 @@ namespace PHX {
   PHX_EVALUATOR_CLASS_PP(EvaluatorWithMacros2)
   public:
     void evaluates(const std::string& field_name);
-    void requires(const std::string& field_name);
+    void depends(const std::string& field_name);
     void bindField(const PHX::FieldTag& ft, const std::any& f);
   PHX_EVALUATOR_CLASS_END
 
diff --git a/packages/phalanx/test/EvaluatorMacros/EvaluatorWithMacros_Def.hpp b/packages/phalanx/test/EvaluatorMacros/EvaluatorWithMacros_Def.hpp
index 8361a2bc070c..8e35a31a3850 100644
--- a/packages/phalanx/test/EvaluatorMacros/EvaluatorWithMacros_Def.hpp
+++ b/packages/phalanx/test/EvaluatorMacros/EvaluatorWithMacros_Def.hpp
@@ -45,7 +45,7 @@ namespace PHX {
   }
 
   template<typename EvalT,typename Traits>
-  void EvaluatorWithMacros1<EvalT,Traits>::requires(const std::string& n)
+  void EvaluatorWithMacros1<EvalT,Traits>::depends(const std::string& n)
   {
     using Teuchos::RCP;
     using Teuchos::rcp;
@@ -105,7 +105,7 @@ namespace PHX {
   }
 
   template<typename EvalT,typename Traits>
-  void EvaluatorWithMacros2<EvalT,Traits>::requires(const std::string& n)
+  void EvaluatorWithMacros2<EvalT,Traits>::depends(const std::string& n)
   {
     using Teuchos::RCP;
     using Teuchos::rcp;
diff --git a/packages/phalanx/test/Kokkos/CMakeLists.txt b/packages/phalanx/test/Kokkos/CMakeLists.txt
index a13eaf8b99a6..98b582cd8c2a 100644
--- a/packages/phalanx/test/Kokkos/CMakeLists.txt
+++ b/packages/phalanx/test/Kokkos/CMakeLists.txt
@@ -1,11 +1,14 @@
 TRIBITS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
 TRIBITS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/../Utilities)
 
+# RUN_SERIAL is added since UniqueToken can require a large amount of
+# memory on GPUs.
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
   tKokkos
   SOURCES tKokkos.cpp
   TESTONLYLIBS phalanx_unit_test_main phalanx_test_utilities
   NUM_MPI_PROCS 1
+  RUN_SERIAL
   )
 
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
diff --git a/packages/phalanx/test/Kokkos/tKokkos.cpp b/packages/phalanx/test/Kokkos/tKokkos.cpp
index cdadac61779d..b42e8355e4b3 100644
--- a/packages/phalanx/test/Kokkos/tKokkos.cpp
+++ b/packages/phalanx/test/Kokkos/tKokkos.cpp
@@ -1,6 +1,6 @@
 // @HEADER
 // *****************************************************************************
-//        Phalanx: A Partial Differential Equation Field Evaluation 
+//        Phalanx: A Partial Differential Equation Field Evaluation
 //       Kernel for Flexible Management of Complex Dependency Chains
 //
 // Copyright 2008 NTESS and the Phalanx contributors.
@@ -828,6 +828,8 @@ namespace phalanx_test {
 
 #if defined(KOKKOS_ENABLE_CUDA)
     using DefaultFadLayout = Kokkos::LayoutContiguous<DefaultDevLayout,32>;
+#elif defined(KOKKOS_ENABLE_HIP)
+    using DefaultFadLayout = Kokkos::LayoutContiguous<DefaultDevLayout,64>;
 #else
     using DefaultFadLayout = Kokkos::LayoutContiguous<DefaultDevLayout,1>;
 #endif
@@ -841,13 +843,13 @@ namespace phalanx_test {
     static_assert(std::is_same<scalar_view_layout,DefaultDevLayout>::value,"ERROR: Layout Inconsistency!");
     static_assert(std::is_same<fad_view_layout,DefaultFadLayout>::value,"ERROR: Layout Inconsistency!");
 
-    std::cout << "\n\nscalar_view_layout = " << PHX::print<scalar_view_layout>() << std::endl;
-    std::cout << "scalar_dev_layout  = " << PHX::print<scalar_dev_layout>() << std::endl;
-    std::cout << "DefaultDevLayout   = " << PHX::print<DefaultDevLayout>() << "\n" << std::endl;
+    out << "\n\nscalar_view_layout = " << PHX::print<scalar_view_layout>() << std::endl;
+    out << "scalar_dev_layout  = " << PHX::print<scalar_dev_layout>() << std::endl;
+    out << "DefaultDevLayout   = " << PHX::print<DefaultDevLayout>() << "\n" << std::endl;
 
-    std::cout << "fad_view_layout    = " << PHX::print<fad_view_layout>() << std::endl;
-    std::cout << "fad_dev_layout     = " << PHX::print<fad_dev_layout>() << std::endl;
-    std::cout << "DefaultFadLayout   = " << PHX::print<DefaultFadLayout>() << "\n" << std::endl;
+    out << "fad_view_layout    = " << PHX::print<fad_view_layout>() << std::endl;
+    out << "fad_dev_layout     = " << PHX::print<fad_dev_layout>() << std::endl;
+    out << "DefaultFadLayout   = " << PHX::print<DefaultFadLayout>() << "\n" << std::endl;
 
     // Tests for assignments from static View to DynRankView
     Kokkos::View<FadType**,typename PHX::DevLayout<FadType>::type,PHX::Device> static_a("static_a",100,8,64);
@@ -969,4 +971,120 @@ namespace phalanx_test {
     TEST_FLOATING_EQUALITY(mean,mean_gold,tol);
     TEST_FLOATING_EQUALITY(stddev,stddev_gold,tol);
   }
+
+  // Nested aggregates holding a UniqueToken. The UniqueToken test captures
+  // an Outer by value in a device lambda and acquires a token through the
+  // nested member.
+  struct Inner {
+    Kokkos::Experimental::UniqueToken<Kokkos::DefaultExecutionSpace> token_;
+  };
+
+  struct Outer {
+    Inner inner_;
+  };
+
+  // Acquires a token in every iteration of a parallel_for that has more
+  // iterations than tokens, and writes to a scratch slot indexed by the token.
+  TEUCHOS_UNIT_TEST(kokkos, UniqueToken)
+  {
+    Kokkos::print_configuration(out);
+
+    using ExecutionSpace = PHX::exec_space;
+
+    Kokkos::Experimental::UniqueToken<ExecutionSpace> token;
+
+    out << "\nExecutionSpace.concurrency() = " << ExecutionSpace().concurrency() << std::endl;
+    out << "UniqueToken.size() = " << token.size() << std::endl;
+
+    TEST_EQUALITY(ExecutionSpace().concurrency(), token.size());
+
+    Outer o;
+
+    // Size the scratch space from the token the kernel actually acquires
+    // from: Inner is declared on Kokkos::DefaultExecutionSpace, which may
+    // differ from PHX::exec_space used for "token" above.
+    const size_t num_tokens = o.inner_.token_.size();
+    // More iterations than tokens, so some token value is acquired repeatedly.
+    const size_t num_elements = num_tokens+10;
+
+    Kokkos::View<int*> scratch_space("scratch space",num_tokens);
+    Kokkos::parallel_for("unique token",num_elements,KOKKOS_LAMBDA(const int cell){
+        Kokkos::Experimental::AcquireUniqueToken lock(o.inner_.token_);
+        const auto t = lock.value();
+        scratch_space(t) = cell;
+        // printf("cell=%d, t=%u, equal=%u\n",cell,t,unsigned(cell == t));
+    });
+  }
+
+  // Sums the even-indexed entries of a device view with parallel_reduce and
+  // compares against the same sum accumulated on the host.
+  TEUCHOS_UNIT_TEST(kokkos, ReduceCheck)
+  {
+    constexpr int size = 10;
+    double gold_sum = 0.0;
+    Kokkos::View<double*> parts("parts",size);
+    auto parts_host = Kokkos::create_mirror_view(parts);
+    for (int i=0; i < size; ++i) {
+      parts_host(i) = double(i);
+
+      if (i%2 == 0)
+        gold_sum += double(i);
+    }
+    Kokkos::deep_copy(parts,parts_host);
+
+    double sum = 0.0;
+    Kokkos::parallel_reduce("sum",size,KOKKOS_LAMBDA(const int i, double& tmp){
+      if (i%2 == 0)
+        tmp += parts(i);
+      // printf("tmp(%d)=%f \n",i,tmp);
+    },sum);
+    out << "sum = " << sum << std::endl;
+    const double tol = Teuchos::ScalarTraits<double>::eps()*1000.0;
+    TEST_FLOATING_EQUALITY(sum,gold_sum,tol);
+  }
+
+  // Runs parallel_scan over a device view, recording the running sum before
+  // (exclusive) and after (inclusive) each entry is added, and checks that
+  // the two differ by exactly that entry.
+  TEUCHOS_UNIT_TEST(kokkos, ScanCheck)
+  {
+    constexpr int size = 10;
+    Kokkos::View<double*> parts("parts",size);
+    auto parts_host = Kokkos::create_mirror_view(parts);
+    for (int i=0; i < size; ++i)
+      parts_host(i)=double(i);
+    Kokkos::deep_copy(parts,parts_host);
+
+    Kokkos::View<double*> inclusive_scan("inclusive",size);
+    Kokkos::View<double*> exclusive_scan("exclusive",size);
+    double result = 0.0;
+    Kokkos::parallel_scan("sum",size,KOKKOS_LAMBDA(const int i, double& partial_sum, const bool is_final){
+      if (is_final)
+        exclusive_scan(i) = partial_sum;
+
+      partial_sum += parts(i);
+
+      // Assign (not accumulate) so the stored value does not depend on the
+      // previous contents of the view.
+      if (is_final)
+        inclusive_scan(i) = partial_sum;
+
+      // printf("partial_sum(%d)=%f, is_final=%d \n",i,partial_sum,int(is_final));
+    },result);
+
+    auto is_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),inclusive_scan);
+    auto es_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),exclusive_scan);
+
+    for (int i=0; i < size; ++i)
+      out << "inclusive_scan(" << i << ") = " << is_host(i) << ", parts(" << i << ") = " << parts_host(i) << std::endl;
+    for (int i=0; i < size; ++i)
+      out << "exclusive_scan(" << i << ") = " << es_host(i) << ", parts(" << i << ") = " << parts_host(i) << std::endl;
+    out << "result (exclusive end) = " << result << std::endl;
+
+    const double tol = Teuchos::ScalarTraits<double>::eps()*100.0;
+    for (int i=0; i < size; ++i) {
+      TEST_FLOATING_EQUALITY(is_host(i)-es_host(i), parts_host(i), tol);
+    }
+  }
+
 }
diff --git a/packages/phalanx/test/Utilities/Evaluator_MockDAG.hpp b/packages/phalanx/test/Utilities/Evaluator_MockDAG.hpp
index 378406c1ea0b..3981f832352c 100644
--- a/packages/phalanx/test/Utilities/Evaluator_MockDAG.hpp
+++ b/packages/phalanx/test/Utilities/Evaluator_MockDAG.hpp
@@ -25,7 +25,7 @@ namespace PHX {
 			       PHX::FieldManager<Traits>& fm);
     void evaluateFields(typename Traits::EvalData d);
     void evaluates(const std::string& field_name, const bool use_dynamic_layout=false);
-    void requires(const std::string& field_name, const bool use_dynamic_layout=false);
+    void depends(const std::string& field_name, const bool use_dynamic_layout=false);
     void contributes(const std::string& field_name, const bool use_dynamic_layout=false);
     void unshared(const std::string& field_name);
   };
diff --git a/packages/phalanx/test/Utilities/Evaluator_MockDAG_Def.hpp b/packages/phalanx/test/Utilities/Evaluator_MockDAG_Def.hpp
index d755b35f50ef..4c6884f8cbdd 100644
--- a/packages/phalanx/test/Utilities/Evaluator_MockDAG_Def.hpp
+++ b/packages/phalanx/test/Utilities/Evaluator_MockDAG_Def.hpp
@@ -48,8 +48,8 @@ namespace PHX {
   }
 
   template<typename EvalT,typename Traits>
-  void MockDAG<EvalT,Traits>::requires(const std::string& n,
-                                       const bool use_dynamic_layout)
+  void MockDAG<EvalT,Traits>::depends(const std::string& n,
+                                      const bool use_dynamic_layout)
   {
     using Teuchos::RCP;
     using Teuchos::rcp;
diff --git a/packages/rtop/src/support/RTOpPack_SPMD_apply_op_decl.hpp b/packages/rtop/src/support/RTOpPack_SPMD_apply_op_decl.hpp
index 3e23800b14d4..07bd6071ab84 100644
--- a/packages/rtop/src/support/RTOpPack_SPMD_apply_op_decl.hpp
+++ b/packages/rtop/src/support/RTOpPack_SPMD_apply_op_decl.hpp
@@ -156,7 +156,7 @@ class ReductTargetReductionOp
   Teuchos::RCP<const RTOpT<Scalar> >  op_;
   // Not defined and not to be called!
   ReductTargetReductionOp();
-  ReductTargetReductionOp<Scalar>(const ReductTargetReductionOp<Scalar>&);
+  ReductTargetReductionOp(const ReductTargetReductionOp<Scalar>&);
   ReductTargetReductionOp<Scalar>& operator=(const ReductTargetReductionOp<Scalar>&);
 };
 
diff --git a/packages/sacado/src/KokkosExp_View_Fad_Contiguous.hpp b/packages/sacado/src/KokkosExp_View_Fad_Contiguous.hpp
index 1dcf26543372..c5db121cdb59 100644
--- a/packages/sacado/src/KokkosExp_View_Fad_Contiguous.hpp
+++ b/packages/sacado/src/KokkosExp_View_Fad_Contiguous.hpp
@@ -148,7 +148,11 @@ namespace Sacado {
 
 #include "Sacado_Traits.hpp"
 #include "Kokkos_Core.hpp"
+#if KOKKOS_VERSION >= 40499
+#include "View/Kokkos_ViewMapping.hpp"
+#else
 #include "impl/Kokkos_ViewMapping.hpp"
+#endif
 
 //----------------------------------------------------------------------------
 
diff --git a/packages/sacado/src/Kokkos_LayoutContiguous.hpp b/packages/sacado/src/Kokkos_LayoutContiguous.hpp
index dedc05c78a86..acb722d2fe99 100644
--- a/packages/sacado/src/Kokkos_LayoutContiguous.hpp
+++ b/packages/sacado/src/Kokkos_LayoutContiguous.hpp
@@ -73,6 +73,7 @@ struct inner_layout< LayoutContiguous<Layout, Stride> > {
 
 } // namespace Kokkos
 
+// FIXME This is evil and needs refactoring urgently.
 // Make LayoutContiguous<Layout> equivalent to Layout
 namespace std {
 
@@ -81,14 +82,31 @@ namespace std {
     static const bool value = true;
   };
 
+  template <class Layout, unsigned Stride>
+#if defined(KOKKOS_COMPILER_INTEL)
+  inline constexpr bool is_same_v< Kokkos::LayoutContiguous<Layout,Stride>, Layout> = is_same<Kokkos::LayoutContiguous<Layout,Stride>, Layout>::value;
+#else
+  static constexpr bool is_same_v< Kokkos::LayoutContiguous<Layout,Stride>, Layout> = is_same<Kokkos::LayoutContiguous<Layout,Stride>, Layout>::value;
+#endif
+
   template <class Layout, unsigned Stride>
   struct is_same< Layout, Kokkos::LayoutContiguous<Layout,Stride> > {
     static const bool value = true;
   };
 
+  template <class Layout, unsigned Stride>
+#if defined(KOKKOS_COMPILER_INTEL)
+  inline constexpr bool is_same_v< Layout, Kokkos::LayoutContiguous<Layout,Stride>> = is_same<Kokkos::LayoutContiguous<Layout,Stride>, Layout>::value;
+#else
+  static constexpr bool is_same_v< Layout, Kokkos::LayoutContiguous<Layout,Stride>> = is_same<Kokkos::LayoutContiguous<Layout,Stride>, Layout>::value;
+#endif
 }
 
+#if KOKKOS_VERSION >= 40499
+#include "View/Kokkos_ViewMapping.hpp"
+#else
 #include "impl/Kokkos_ViewMapping.hpp"
+#endif
 
 namespace Kokkos {
 namespace Impl {
diff --git a/packages/sacado/src/Kokkos_LayoutNatural.hpp b/packages/sacado/src/Kokkos_LayoutNatural.hpp
index e4e77d023c1c..1a5ae982295f 100644
--- a/packages/sacado/src/Kokkos_LayoutNatural.hpp
+++ b/packages/sacado/src/Kokkos_LayoutNatural.hpp
@@ -79,7 +79,11 @@ namespace std {
 
 }
 
+#if KOKKOS_VERSION >= 40499
+#include "View/Kokkos_ViewMapping.hpp"
+#else
 #include "impl/Kokkos_ViewMapping.hpp"
+#endif
 
 namespace Kokkos {
 namespace Impl {
diff --git a/packages/sacado/src/new_design/Sacado_Fad_Exp_ViewStorage.hpp b/packages/sacado/src/new_design/Sacado_Fad_Exp_ViewStorage.hpp
index 184eea7bae11..3b67f97d8cf3 100644
--- a/packages/sacado/src/new_design/Sacado_Fad_Exp_ViewStorage.hpp
+++ b/packages/sacado/src/new_design/Sacado_Fad_Exp_ViewStorage.hpp
@@ -79,7 +79,7 @@ namespace Sacado {
       //! Constructor
       SACADO_INLINE_FUNCTION
       ViewStorage(T* v, const int arg_size = 0, const int arg_stride = 0) :
-        sz_(arg_size), stride_(arg_stride), val_(v+sz_.value*stride_.value), dx_(v) {}
+        sz_(arg_size), stride_(arg_stride), val_(v+sz_.value*static_cast<int>(stride_.value)), dx_(v) {}
 
       //! Constructor
       SACADO_INLINE_FUNCTION
diff --git a/packages/seacas/applications/cpup/cpup.C b/packages/seacas/applications/cpup/cpup.C
index c88c46703bda..0024535483e5 100644
--- a/packages/seacas/applications/cpup/cpup.C
+++ b/packages/seacas/applications/cpup/cpup.C
@@ -35,6 +35,20 @@
 
 unsigned int debug_level = 0;
 
+#if FMT_VERSION >= 90000
+// Format these Ioss types through their ostream output by deriving the
+// fmt::formatter specializations from fmt::ostream_formatter.
+namespace fmt {
+  template <> struct formatter<Ioss::ZoneConnectivity> : ostream_formatter
+  {
+  };
+
+  template <> struct formatter<Ioss::BoundaryCondition> : ostream_formatter
+  {
+  };
+} // namespace fmt
+#endif
+
 namespace {
   std::string tsFormat = "[{:%H:%M:%S}] ";
 
diff --git a/packages/seacas/applications/epu/epu.C b/packages/seacas/applications/epu/epu.C
index 30a11cefd1a1..79134fd827b7 100644
--- a/packages/seacas/applications/epu/epu.C
+++ b/packages/seacas/applications/epu/epu.C
@@ -1453,8 +1453,8 @@ int epu(SystemInterface &interFace, int start_part, int part_count, int cycle, T
             for (int ig = 0; ig < global_vars.count(InOut::IN); ig++) {
               if (proc_global_values[ig] != global_values[ig]) {
                 fmt::print(stderr,
-                           "At step {:{}}, Global Variable {:{}}, P{:0{}} = {:15.8g}, P{:0{}} = "
-                           "{:15.8g}\n",
+                           fmt::runtime("At step {:{}}, Global Variable {:{}}, P{:0{}} = {:15.8g}, P{:0{}} = "
+					"{:15.8g}\n"),
                            time_step + 1, ts_max + 1, ig + 1,
                            get_width(global_vars.count(InOut::IN)), start_part,
                            get_width(interFace.processor_count()), start_part + p,
diff --git a/packages/seacas/applications/exodiff/edge_block.C b/packages/seacas/applications/exodiff/edge_block.C
index 1debce5fadb0..6696ecdf980a 100644
--- a/packages/seacas/applications/exodiff/edge_block.C
+++ b/packages/seacas/applications/exodiff/edge_block.C
@@ -50,12 +50,12 @@ template <typename INT> void Edge_Block<INT>::entity_load_params()
 
   if (num_edges_per_elmt < 0 || num_attr < 0) {
     Error(fmt::format(
-        "Edge_Block<INT>::entity_load_params(): Data appears corrupt for edge block {}!\n"
-        "\tnum elmts          = {}\n"
-        "\tnum edges per elmt = {}\n"
-        "\tnum attributes     = {}\n"
-        " ... Aborting...\n",
-        fmt::group_digits(numEntity), num_edges_per_elmt, num_attr));
+		      fmt::runtime("Edge_Block<INT>::entity_load_params(): Data appears corrupt for edge block {}!\n"
+				   "\tnum elmts          = {}\n"
+				   "\tnum edges per elmt = {}\n"
+				   "\tnum attributes     = {}\n"
+				   " ... Aborting...\n"),
+		      fmt::group_digits(numEntity), num_edges_per_elmt, num_attr));
   }
 }
 
diff --git a/packages/seacas/applications/exodiff/exo_block.C b/packages/seacas/applications/exodiff/exo_block.C
index ab601e200f7e..4eb93464c746 100644
--- a/packages/seacas/applications/exodiff/exo_block.C
+++ b/packages/seacas/applications/exodiff/exo_block.C
@@ -53,11 +53,11 @@ template <typename INT> void Exo_Block<INT>::entity_load_params()
   elmt_type          = block.topology;
 
   if (num_nodes_per_elmt < 0 || num_attr < 0) {
-    Error(fmt::format("Exo_Block<INT>::entity_load_params(): Data appears corrupt for block {}!\n"
+    Error(fmt::format(fmt::runtime("Exo_Block<INT>::entity_load_params(): Data appears corrupt for block {}!\n"
                       "\tnum elmts          = {}\n"
                       "\tnum nodes per elmt = {}\n"
                       "\tnum attributes     = {}\n"
-                      " ... Aborting...\n",
+				   " ... Aborting...\n"),
                       fmt::group_digits(numEntity), num_nodes_per_elmt, num_attr));
   }
 }
diff --git a/packages/seacas/applications/exodiff/face_block.C b/packages/seacas/applications/exodiff/face_block.C
index 11b4d8ba6fb8..d8d8b7a21f7e 100644
--- a/packages/seacas/applications/exodiff/face_block.C
+++ b/packages/seacas/applications/exodiff/face_block.C
@@ -50,11 +50,11 @@ template <typename INT> void Face_Block<INT>::entity_load_params()
 
   if (num_faces_per_elmt < 0 || num_attr < 0) {
     Error(fmt::format(
-        "Face_Block<INT>::entity_load_params(): Data appears corrupt for face block {}!\n"
-        "\tnum elmts          = {}\n"
-        "\tnum faces per elmt = {}\n"
-        "\tnum attributes     = {}\n"
-        " ... Aborting...\n",
+		      fmt::runtime("Face_Block<INT>::entity_load_params(): Data appears corrupt for face block {}!\n"
+				   "\tnum elmts          = {}\n"
+				   "\tnum faces per elmt = {}\n"
+				   "\tnum attributes     = {}\n"
+				   " ... Aborting...\n"),
         fmt::group_digits(numEntity), num_faces_per_elmt, num_attr));
   }
 }
diff --git a/packages/seacas/applications/nem_spread/pe_input.C b/packages/seacas/applications/nem_spread/pe_input.C
index fda7e6b4f4b6..4fc6af2933f4 100644
--- a/packages/seacas/applications/nem_spread/pe_input.C
+++ b/packages/seacas/applications/nem_spread/pe_input.C
@@ -243,13 +243,15 @@ int read_pexoII_info(NemSpread<T, INT> &spreader, const char *filename)
             /* "{" defines the beginning of the group designator */
             cptr2 = strchr(cptr, '{');
             if (cptr2 == nullptr) {
-              fmt::print(stderr, "fatal: list start designator \"{\" not found");
+              /* The brace is doubled so fmt prints it literally instead of parsing a field. */
+              fmt::print(stderr, fmt::runtime("fatal: list start designator \"{{\" not found"));
               exit(1);
             }
             cptr2++;
             cptr3 = strchr(cptr, '}');
             if (cptr3 == nullptr) {
-              fmt::print(stderr, "fatal: list end designator \"}\" not found");
+              /* The brace is doubled so fmt prints it literally instead of rejecting it. */
+              fmt::print(stderr, fmt::runtime("fatal: list end designator \"}}\" not found"));
               exit(1);
             }
             *cptr3 = '\0';
diff --git a/packages/seacas/libraries/ioss/src/Ioss_DecompositionUtils.C b/packages/seacas/libraries/ioss/src/Ioss_DecompositionUtils.C
index d6d5fdd097b4..f3c1e051ca34 100644
--- a/packages/seacas/libraries/ioss/src/Ioss_DecompositionUtils.C
+++ b/packages/seacas/libraries/ioss/src/Ioss_DecompositionUtils.C
@@ -501,14 +501,14 @@ namespace Ioss {
       for (size_t i = 0; i < elem_per_rank.size(); i++) {
         int star_cnt =
             (double)(elem_per_rank[i] - min_work) / (max_work - min_work) * delta + min_star;
-        std::string stars(star_cnt, '*');
-        std::string format = "\tProcessor {:{}}, work = {:{}}  ({:.2f})\t{}\n";
+        std::string       stars(star_cnt, '*');
+        const std::string format = "\tProcessor {:{}}, work = {:{}}  ({:.2f})\t{}\n";
         if (elem_per_rank[i] == max_work) {
           fmt::print(
 #if !defined __NVCC__
               fg(fmt::color::red),
 #endif
-              format, i, proc_width, fmt::group_digits(elem_per_rank[i]), work_width,
+              fmt::runtime(format), i, proc_width, fmt::group_digits(elem_per_rank[i]), work_width,
               (double)elem_per_rank[i] / avg_work, stars);
         }
         else if (elem_per_rank[i] == min_work) {
@@ -516,12 +516,12 @@ namespace Ioss {
 #if !defined __NVCC__
               fg(fmt::color::green),
 #endif
-              format, i, proc_width, fmt::group_digits(elem_per_rank[i]), work_width,
+              fmt::runtime(format), i, proc_width, fmt::group_digits(elem_per_rank[i]), work_width,
               elem_per_rank[i] / avg_work, stars);
         }
         else {
-          fmt::print(format, i, proc_width, fmt::group_digits(elem_per_rank[i]), work_width,
-                     elem_per_rank[i] / avg_work, stars);
+          fmt::print(fmt::runtime(format), i, proc_width, fmt::group_digits(elem_per_rank[i]),
+                     work_width, elem_per_rank[i] / avg_work, stars);
         }
       }
 
diff --git a/packages/seacas/libraries/ioss/src/Ioss_DynamicTopology.C b/packages/seacas/libraries/ioss/src/Ioss_DynamicTopology.C
index 639f6ba47a59..16b73da0a4ee 100644
--- a/packages/seacas/libraries/ioss/src/Ioss_DynamicTopology.C
+++ b/packages/seacas/libraries/ioss/src/Ioss_DynamicTopology.C
@@ -432,7 +432,8 @@ std::string DynamicTopologyFileControl::construct_database_filename(int& step, I
       error_message += "The database FILENAME has not been defined\n";
     }
     std::ostringstream errmsg;
-    fmt::print(errmsg, error_message);
+    // Pass the message as an argument, not as the format string, so any braces in it print literally.
+    fmt::print(errmsg, "{}", error_message);
     IOSS_ERROR(errmsg);
   }
   assert(!m_ioDB.empty());
diff --git a/packages/seacas/libraries/ioss/src/exodus/Ioex_DatabaseIO.C b/packages/seacas/libraries/ioss/src/exodus/Ioex_DatabaseIO.C
index d368bd2ff93d..d84be0925268 100644
--- a/packages/seacas/libraries/ioss/src/exodus/Ioex_DatabaseIO.C
+++ b/packages/seacas/libraries/ioss/src/exodus/Ioex_DatabaseIO.C
@@ -684,6 +684,7 @@ namespace Ioex {
       {
         Ioss::SerializeIO serializeIO_(this);
         m_timestepCount = ex_inquire_int(get_file_pointer(), EX_INQ_TIME);
+      }
 	// Need to sync timestep count across ranks if parallel...
 	if (isParallel) {
 	  auto min_timestep_count = util().global_minmax(m_timestepCount, Ioss::ParallelUtils::DO_MIN);
@@ -725,6 +726,7 @@ namespace Ioex {
         Ioss::Utils::check_set_bool_property(properties, "EXODUS_CALL_GET_ALL_TIMES",
                                              call_ex_get_all_times);
         if (call_ex_get_all_times) {
+	  Ioss::SerializeIO serializeIO_(this);
           int error = ex_get_all_times(get_file_pointer(), Data(tsteps));
           if (error < 0) {
             Ioex::exodus_error(get_file_pointer(), __LINE__, __func__, __FILE__);
@@ -733,8 +735,11 @@ namespace Ioex {
 
         // See if the "last_written_time" attribute exists and if it
         // does, check that it matches the largest time in 'tsteps'.
-        exists = Ioex::read_last_time_attribute(get_file_pointer(), &last_time);
-      }
+	{
+	  Ioss::SerializeIO serializeIO_(this);
+	  exists = Ioex::read_last_time_attribute(get_file_pointer(), &last_time);
+	}
+
       if (exists && isParallel) {
         // Assume that if it exists on 1 processor, it exists on
         // all... Sync value among processors since could have a
diff --git a/packages/seacas/libraries/ioss/src/main/cgns_decomp.C b/packages/seacas/libraries/ioss/src/main/cgns_decomp.C
index 850afe05fb49..2ef43f03c6b6 100644
--- a/packages/seacas/libraries/ioss/src/main/cgns_decomp.C
+++ b/packages/seacas/libraries/ioss/src/main/cgns_decomp.C
@@ -633,13 +633,13 @@ namespace {
           int star_cnt =
               (double)(proc_work[i] - min_work) / (max_work - min_work) * delta + min_star;
           std::string stars(star_cnt, '*');
-          std::string format = "\tProcessor {:{}}, work = {:{}}  ({:.2f})\t{}\n";
+          const std::string format = "\tProcessor {:{}}, work = {:{}}  ({:.2f})\t{}\n";
           if (proc_work[i] == max_work) {
             fmt::print(
 #if !defined __NVCC__
                 fg(fmt::color::red),
 #endif
-                format, i, proc_width, fmt::group_digits(proc_work[i]), work_width,
+                fmt::runtime(format), i, proc_width, fmt::group_digits(proc_work[i]), work_width,
                 proc_work[i] / avg_work, stars);
           }
           else if (proc_work[i] == min_work) {
@@ -647,11 +647,11 @@ namespace {
 #if !defined __NVCC__
                 fg(fmt::color::green),
 #endif
-                format, i, proc_width, fmt::group_digits(proc_work[i]), work_width,
+                fmt::runtime(format), i, proc_width, fmt::group_digits(proc_work[i]), work_width,
                 proc_work[i] / avg_work, stars);
           }
           else {
-            fmt::print(format, i, proc_width, fmt::group_digits(proc_work[i]), work_width,
+            fmt::print(fmt::runtime(format), i, proc_width, fmt::group_digits(proc_work[i]), work_width,
                        proc_work[i] / avg_work, stars);
           }
           if (verbose) {
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/args.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/args.h
index a3966d140719..31a60e8faf1a 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/args.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/args.h
@@ -1,4 +1,4 @@
-// Formatting library for C++ - dynamic format arguments
+// Formatting library for C++ - dynamic argument lists
 //
 // Copyright (c) 2012 - present, Victor Zverovich
 // All rights reserved.
@@ -8,11 +8,13 @@
 #ifndef FMT_ARGS_H_
 #define FMT_ARGS_H_
 
-#include <functional>  // std::reference_wrapper
-#include <memory>      // std::unique_ptr
-#include <vector>
+#ifndef FMT_MODULE
+#  include <functional>  // std::reference_wrapper
+#  include <memory>      // std::unique_ptr
+#  include <vector>
+#endif
 
-#include "core.h"
+#include "format.h"  // std_string_view
 
 FMT_BEGIN_NAMESPACE
 
@@ -22,20 +24,24 @@ template <typename T> struct is_reference_wrapper : std::false_type {};
 template <typename T>
 struct is_reference_wrapper<std::reference_wrapper<T>> : std::true_type {};
 
-template <typename T> const T& unwrap(const T& v) { return v; }
-template <typename T> const T& unwrap(const std::reference_wrapper<T>& v) {
+template <typename T> auto unwrap(const T& v) -> const T& { return v; }
+template <typename T>
+auto unwrap(const std::reference_wrapper<T>& v) -> const T& {
   return static_cast<const T&>(v);
 }
 
-class dynamic_arg_list {
-  // Workaround for clang's -Wweak-vtables. Unlike for regular classes, for
-  // templates it doesn't complain about inability to deduce single translation
-  // unit for placing vtable. So storage_node_base is made a fake template.
-  template <typename = void> struct node {
-    virtual ~node() = default;
-    std::unique_ptr<node<>> next;
-  };
+// node is defined outside dynamic_arg_list to workaround a C2504 bug in MSVC
+// 2022 (v17.10.0).
+//
+// Workaround for clang's -Wweak-vtables. Unlike for regular classes, for
+// templates it doesn't complain about inability to deduce single translation
+// unit for placing vtable. So node is made a fake template.
+template <typename = void> struct node {
+  virtual ~node() = default;
+  std::unique_ptr<node<>> next;
+};
 
+class dynamic_arg_list {
   template <typename T> struct typed_node : node<> {
     T value;
 
@@ -50,7 +56,7 @@ class dynamic_arg_list {
   std::unique_ptr<node<>> head_;
 
  public:
-  template <typename T, typename Arg> const T& push(const Arg& arg) {
+  template <typename T, typename Arg> auto push(const Arg& arg) -> const T& {
     auto new_node = std::unique_ptr<typed_node<T>>(new typed_node<T>(arg));
     auto& value = new_node->value;
     new_node->next = std::move(head_);
@@ -61,14 +67,10 @@ class dynamic_arg_list {
 }  // namespace detail
 
 /**
-  \rst
-  A dynamic version of `fmt::format_arg_store`.
-  It's equipped with a storage to potentially temporary objects which lifetimes
-  could be shorter than the format arguments object.
-
-  It can be implicitly converted into `~fmt::basic_format_args` for passing
-  into type-erased formatting functions such as `~fmt::vformat`.
-  \endrst
+ * A dynamic list of formatting arguments with storage.
+ *
+ * It can be implicitly converted into `fmt::basic_format_args` for passing
+ * into type-erased formatting functions such as `fmt::vformat`.
  */
 template <typename Context>
 class dynamic_format_arg_store
@@ -110,14 +112,14 @@ class dynamic_format_arg_store
 
   friend class basic_format_args<Context>;
 
-  unsigned long long get_types() const {
+  auto get_types() const -> unsigned long long {
     return detail::is_unpacked_bit | data_.size() |
            (named_info_.empty()
                 ? 0ULL
                 : static_cast<unsigned long long>(detail::has_named_args_bit));
   }
 
-  const basic_format_arg<Context>* data() const {
+  auto data() const -> const basic_format_arg<Context>* {
     return named_info_.empty() ? data_.data() : data_.data() + 1;
   }
 
@@ -146,22 +148,20 @@ class dynamic_format_arg_store
   constexpr dynamic_format_arg_store() = default;
 
   /**
-    \rst
-    Adds an argument into the dynamic store for later passing to a formatting
-    function.
-
-    Note that custom types and string types (but not string views) are copied
-    into the store dynamically allocating memory if necessary.
-
-    **Example**::
-
-      fmt::dynamic_format_arg_store<fmt::format_context> store;
-      store.push_back(42);
-      store.push_back("abc");
-      store.push_back(1.5f);
-      std::string result = fmt::vformat("{} and {} and {}", store);
-    \endrst
-  */
+   * Adds an argument into the dynamic store for later passing to a formatting
+   * function.
+   *
+   * Note that custom types and string types (but not string views) are copied
+   * into the store dynamically allocating memory if necessary.
+   *
+   * **Example**:
+   *
+   *     fmt::dynamic_format_arg_store<fmt::format_context> store;
+   *     store.push_back(42);
+   *     store.push_back("abc");
+   *     store.push_back(1.5f);
+   *     std::string result = fmt::vformat("{} and {} and {}", store);
+   */
   template <typename T> void push_back(const T& arg) {
     if (detail::const_check(need_copy<T>::value))
       emplace_arg(dynamic_args_.push<stored_type<T>>(arg));
@@ -170,20 +170,18 @@ class dynamic_format_arg_store
   }
 
   /**
-    \rst
-    Adds a reference to the argument into the dynamic store for later passing to
-    a formatting function.
-
-    **Example**::
-
-      fmt::dynamic_format_arg_store<fmt::format_context> store;
-      char band[] = "Rolling Stones";
-      store.push_back(std::cref(band));
-      band[9] = 'c'; // Changing str affects the output.
-      std::string result = fmt::vformat("{}", store);
-      // result == "Rolling Scones"
-    \endrst
-  */
+   * Adds a reference to the argument into the dynamic store for later passing
+   * to a formatting function.
+   *
+   * **Example**:
+   *
+   *     fmt::dynamic_format_arg_store<fmt::format_context> store;
+   *     char band[] = "Rolling Stones";
+   *     store.push_back(std::cref(band));
+   *     band[9] = 'c'; // Changing band affects the output.
+   *     std::string result = fmt::vformat("{}", store);
+   *     // result == "Rolling Scones"
+   */
   template <typename T> void push_back(std::reference_wrapper<T> arg) {
     static_assert(
         need_copy<T>::value,
@@ -192,10 +190,10 @@ class dynamic_format_arg_store
   }
 
   /**
-    Adds named argument into the dynamic store for later passing to a formatting
-    function. ``std::reference_wrapper`` is supported to avoid copying of the
-    argument. The name is always copied into the store.
-  */
+   * Adds a named argument into the dynamic store for later passing to a
+   * formatting function. `std::reference_wrapper` is supported to avoid
+   * copying of the argument. The name is always copied into the store.
+   */
   template <typename T>
   void push_back(const detail::named_arg<char_type, T>& arg) {
     const char_type* arg_name =
@@ -208,19 +206,15 @@ class dynamic_format_arg_store
     }
   }
 
-  /** Erase all elements from the store */
+  /// Erase all elements from the store.
   void clear() {
     data_.clear();
     named_info_.clear();
     dynamic_args_ = detail::dynamic_arg_list();
   }
 
-  /**
-    \rst
-    Reserves space to store at least *new_cap* arguments including
-    *new_cap_named* named arguments.
-    \endrst
-  */
+  /// Reserves space to store at least `new_cap` arguments including
+  /// `new_cap_named` named arguments.
   void reserve(size_t new_cap, size_t new_cap_named) {
     FMT_ASSERT(new_cap >= new_cap_named,
                "Set of arguments includes set of named arguments");
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/base.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/base.h
new file mode 100644
index 000000000000..e1568b040c8a
--- /dev/null
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/base.h
@@ -0,0 +1,3078 @@
+// Formatting library for C++ - the base API for char/UTF-8
+//
+// Copyright (c) 2012 - present, Victor Zverovich
+// All rights reserved.
+//
+// For the license information refer to format.h.
+
+#ifndef FMT_BASE_H_
+#define FMT_BASE_H_
+
+#if defined(FMT_IMPORT_STD) && !defined(FMT_MODULE)
+#  define FMT_MODULE
+#endif
+
+#ifndef FMT_MODULE
+#  include <limits.h>  // CHAR_BIT
+#  include <stdio.h>   // FILE
+#  include <string.h>  // memcmp, strlen
+
+// <cstddef> is also included transitively from <type_traits>.
+#  include <cstddef>      // std::byte
+#  include <type_traits>  // std::enable_if
+#endif
+
+// The fmt library version in the form major * 10000 + minor * 100 + patch.
+#define FMT_VERSION 110002
+#define FMT_HEADER_ONLY  // Defined unconditionally in this private copy.
+
+// Detect compiler versions.
+#if defined(__clang__) && !defined(__ibmxl__)
+#  define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
+#else
+#  define FMT_CLANG_VERSION 0
+#endif
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+#  define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#  define FMT_GCC_VERSION 0
+#endif
+#if defined(__ICL)
+#  define FMT_ICC_VERSION __ICL
+#elif defined(__INTEL_COMPILER)
+#  define FMT_ICC_VERSION __INTEL_COMPILER
+#else
+#  define FMT_ICC_VERSION 0
+#endif
+#if defined(_MSC_VER)
+#  define FMT_MSC_VERSION _MSC_VER
+#else
+#  define FMT_MSC_VERSION 0
+#endif
+
+// Detect standard library versions.
+#ifdef _GLIBCXX_RELEASE
+#  define FMT_GLIBCXX_RELEASE _GLIBCXX_RELEASE
+#else
+#  define FMT_GLIBCXX_RELEASE 0
+#endif
+#ifdef _LIBCPP_VERSION
+#  define FMT_LIBCPP_VERSION _LIBCPP_VERSION
+#else
+#  define FMT_LIBCPP_VERSION 0
+#endif
+
+#ifdef _MSVC_LANG
+#  define FMT_CPLUSPLUS _MSVC_LANG
+#else
+#  define FMT_CPLUSPLUS __cplusplus
+#endif
+
+// Detect __has_*.
+#ifdef __has_feature
+#  define FMT_HAS_FEATURE(x) __has_feature(x)
+#else
+#  define FMT_HAS_FEATURE(x) 0
+#endif
+#ifdef __has_include
+#  define FMT_HAS_INCLUDE(x) __has_include(x)
+#else
+#  define FMT_HAS_INCLUDE(x) 0
+#endif
+#ifdef __has_cpp_attribute
+#  define FMT_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+#  define FMT_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+
+#define FMT_HAS_CPP14_ATTRIBUTE(attribute) \
+  (FMT_CPLUSPLUS >= 201402L && FMT_HAS_CPP_ATTRIBUTE(attribute))
+
+#define FMT_HAS_CPP17_ATTRIBUTE(attribute) \
+  (FMT_CPLUSPLUS >= 201703L && FMT_HAS_CPP_ATTRIBUTE(attribute))
+
+// Detect C++14 relaxed constexpr.
+#ifdef FMT_USE_CONSTEXPR
+// Use the provided definition.
+#elif FMT_GCC_VERSION >= 600 && FMT_CPLUSPLUS >= 201402L
+// GCC only allows throw in constexpr since version 6:
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67371.
+#  define FMT_USE_CONSTEXPR 1
+#elif FMT_ICC_VERSION
+#  define FMT_USE_CONSTEXPR 0  // https://github.com/fmtlib/fmt/issues/1628
+#elif FMT_HAS_FEATURE(cxx_relaxed_constexpr) || FMT_MSC_VERSION >= 1912
+#  define FMT_USE_CONSTEXPR 1
+#else
+#  define FMT_USE_CONSTEXPR 0
+#endif
+#if FMT_USE_CONSTEXPR
+#  define FMT_CONSTEXPR constexpr
+#else
+#  define FMT_CONSTEXPR
+#endif
+
+// Detect consteval, C++20 constexpr extensions and std::is_constant_evaluated.
+#if !defined(__cpp_lib_is_constant_evaluated)
+#  define FMT_USE_CONSTEVAL 0
+#elif FMT_CPLUSPLUS < 201709L
+#  define FMT_USE_CONSTEVAL 0
+#elif FMT_GLIBCXX_RELEASE && FMT_GLIBCXX_RELEASE < 10
+#  define FMT_USE_CONSTEVAL 0
+#elif FMT_LIBCPP_VERSION && FMT_LIBCPP_VERSION < 10000
+#  define FMT_USE_CONSTEVAL 0
+#elif defined(__apple_build_version__) && __apple_build_version__ < 14000029L
+#  define FMT_USE_CONSTEVAL 0  // consteval is broken in Apple clang < 14.
+#elif FMT_MSC_VERSION && FMT_MSC_VERSION < 1929
+#  define FMT_USE_CONSTEVAL 0  // consteval is broken in MSVC VS2019 < 16.10.
+#elif defined(__cpp_consteval)
+#  define FMT_USE_CONSTEVAL 1
+#elif FMT_GCC_VERSION >= 1002 || FMT_CLANG_VERSION >= 1101
+#  define FMT_USE_CONSTEVAL 1
+#else
+#  define FMT_USE_CONSTEVAL 0
+#endif
+#if FMT_USE_CONSTEVAL
+#  define FMT_CONSTEVAL consteval
+#  define FMT_CONSTEXPR20 constexpr
+#else
+#  define FMT_CONSTEVAL
+#  define FMT_CONSTEXPR20
+#endif
+
+#if defined(FMT_USE_NONTYPE_TEMPLATE_ARGS)
+// Use the provided definition.
+#elif defined(__NVCOMPILER)
+#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 0
+#elif FMT_GCC_VERSION >= 903 && FMT_CPLUSPLUS >= 201709L
+#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 1
+#elif defined(__cpp_nontype_template_args) && \
+    __cpp_nontype_template_args >= 201911L
+#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 1
+#elif FMT_CLANG_VERSION >= 1200 && FMT_CPLUSPLUS >= 202002L
+#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 1
+#else
+#  define FMT_USE_NONTYPE_TEMPLATE_ARGS 0
+#endif
+
+#ifdef FMT_USE_CONCEPTS
+// Use the provided definition.
+#elif defined(__cpp_concepts)
+#  define FMT_USE_CONCEPTS 1
+#else
+#  define FMT_USE_CONCEPTS 0
+#endif
+
+// Check if exceptions are disabled.
+#ifdef FMT_EXCEPTIONS
+// Use the provided definition.
+#elif defined(__GNUC__) && !defined(__EXCEPTIONS)
+#  define FMT_EXCEPTIONS 0
+#elif FMT_MSC_VERSION && !_HAS_EXCEPTIONS
+#  define FMT_EXCEPTIONS 0
+#else
+#  define FMT_EXCEPTIONS 1
+#endif
+#if FMT_EXCEPTIONS
+#  define FMT_TRY try
+#  define FMT_CATCH(x) catch (x)
+#else
+#  define FMT_TRY if (true)
+#  define FMT_CATCH(x) if (false)
+#endif
+
+#if FMT_HAS_CPP17_ATTRIBUTE(fallthrough)
+#  define FMT_FALLTHROUGH [[fallthrough]]
+#elif defined(__clang__)
+#  define FMT_FALLTHROUGH [[clang::fallthrough]]
+#elif FMT_GCC_VERSION >= 700 && \
+    (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 520)
+#  define FMT_FALLTHROUGH [[gnu::fallthrough]]
+#else
+#  define FMT_FALLTHROUGH
+#endif
+
+// Disable [[noreturn]] on MSVC/NVCC because of bogus unreachable code warnings.
+#if FMT_HAS_CPP_ATTRIBUTE(noreturn) && !FMT_MSC_VERSION && !defined(__NVCC__)
+#  define FMT_NORETURN [[noreturn]]
+#else
+#  define FMT_NORETURN
+#endif
+
+#ifndef FMT_NODISCARD
+#  if FMT_HAS_CPP17_ATTRIBUTE(nodiscard)
+#    define FMT_NODISCARD [[nodiscard]]
+#  else
+#    define FMT_NODISCARD
+#  endif
+#endif
+
+#ifdef FMT_DEPRECATED
+// Use the provided definition.
+#elif FMT_HAS_CPP14_ATTRIBUTE(deprecated)
+#  define FMT_DEPRECATED [[deprecated]]
+#else
+#  define FMT_DEPRECATED /* deprecated */
+#endif
+
+#ifdef FMT_ALWAYS_INLINE
+// Use the provided definition.
+#elif FMT_GCC_VERSION || FMT_CLANG_VERSION
+#  define FMT_ALWAYS_INLINE inline __attribute__((always_inline))
+#else
+#  define FMT_ALWAYS_INLINE inline
+#endif
+// Forced inline with NDEBUG, plain inline in debug; keeps a user FMT_INLINE.
+#if !defined(FMT_INLINE) && defined(NDEBUG)
+#  define FMT_INLINE FMT_ALWAYS_INLINE
+#elif !defined(FMT_INLINE)
+#  define FMT_INLINE inline
+#endif
+
+#if FMT_GCC_VERSION || FMT_CLANG_VERSION
+#  define FMT_VISIBILITY(value) __attribute__((visibility(value)))
+#else
+#  define FMT_VISIBILITY(value)
+#endif
+
+#ifndef FMT_GCC_PRAGMA
+// Work around a _Pragma bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59884
+// and an nvhpc warning: https://github.com/fmtlib/fmt/pull/2582.
+#  if FMT_GCC_VERSION >= 504 && !defined(__NVCOMPILER)
+#    define FMT_GCC_PRAGMA(arg) _Pragma(arg)
+#  else
+#    define FMT_GCC_PRAGMA(arg)
+#  endif
+#endif
+
+// GCC < 5 requires this-> in decltype.
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500
+#  define FMT_DECLTYPE_THIS this->
+#else
+#  define FMT_DECLTYPE_THIS
+#endif
+
+#if FMT_MSC_VERSION
+#  define FMT_MSC_WARNING(...) __pragma(warning(__VA_ARGS__))
+#  define FMT_UNCHECKED_ITERATOR(It) \
+    using _Unchecked_type = It  // Mark iterator as checked.
+#else
+#  define FMT_MSC_WARNING(...)
+#  define FMT_UNCHECKED_ITERATOR(It) using unchecked_type = It
+#endif
+
+#ifndef FMT_BEGIN_NAMESPACE
+#  define FMT_BEGIN_NAMESPACE \
+    namespace fmt {           \
+    inline namespace v11 {
+#  define FMT_END_NAMESPACE \
+    }                       \
+    }
+#endif
+
+#ifndef FMT_EXPORT
+#  define FMT_EXPORT
+#  define FMT_BEGIN_EXPORT
+#  define FMT_END_EXPORT
+#endif
+
+#if !defined(FMT_HEADER_ONLY) && defined(_WIN32)
+#  if defined(FMT_LIB_EXPORT)
+#    define FMT_API __declspec(dllexport)
+#  elif defined(FMT_SHARED)
+#    define FMT_API __declspec(dllimport)
+#  endif
+#elif defined(FMT_LIB_EXPORT) || defined(FMT_SHARED)
+#  define FMT_API FMT_VISIBILITY("default")
+#endif
+#ifndef FMT_API
+#  define FMT_API
+#endif
+
+#ifndef FMT_UNICODE
+#  define FMT_UNICODE 1
+#endif
+
+// Check if rtti is available.
+#ifndef FMT_USE_RTTI
+// __RTTI is for EDG compilers. _CPPRTTI is for MSVC.
+#  if defined(__GXX_RTTI) || FMT_HAS_FEATURE(cxx_rtti) || defined(_CPPRTTI) || \
+      defined(__INTEL_RTTI__) || defined(__RTTI)
+#    define FMT_USE_RTTI 1
+#  else
+#    define FMT_USE_RTTI 0
+#  endif
+#endif
+
+#define FMT_FWD(...) static_cast<decltype(__VA_ARGS__)&&>(__VA_ARGS__)
+
+// Enable minimal optimizations for more compact code in debug mode.
+FMT_GCC_PRAGMA("GCC push_options")
+#if !defined(__OPTIMIZE__) && !defined(__CUDACC__)
+FMT_GCC_PRAGMA("GCC optimize(\"Og\")")
+#endif
+
+FMT_BEGIN_NAMESPACE
+
+// Implementations of enable_if_t and other metafunctions for older systems.
+template <bool B, typename T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+template <bool B, typename T, typename F>
+using conditional_t = typename std::conditional<B, T, F>::type;
+template <bool B> using bool_constant = std::integral_constant<bool, B>;
+template <typename T>
+using remove_reference_t = typename std::remove_reference<T>::type;
+template <typename T>
+using remove_const_t = typename std::remove_const<T>::type;
+template <typename T>
+using remove_cvref_t = typename std::remove_cv<remove_reference_t<T>>::type;
+template <typename T> struct type_identity {
+  using type = T;
+};
+template <typename T> using type_identity_t = typename type_identity<T>::type;
+template <typename T>
+using make_unsigned_t = typename std::make_unsigned<T>::type;
+template <typename T>
+using underlying_t = typename std::underlying_type<T>::type;
+
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500
+// A workaround for gcc 4.8 to make void_t work in a SFINAE context.
+template <typename...> struct void_t_impl {
+  using type = void;
+};
+template <typename... T> using void_t = typename void_t_impl<T...>::type;
+#else
+template <typename...> using void_t = void;
+#endif
+
+struct monostate {
+  constexpr monostate() {}
+};
+
+// An enable_if helper to be used in template parameters which results in much
+// shorter symbols: https://godbolt.org/z/sWw4vP. Extra parentheses are needed
+// to work around a bug in MSVC 2019 (see #1140 and #1186).
+#ifdef FMT_DOC
+#  define FMT_ENABLE_IF(...)
+#else
+#  define FMT_ENABLE_IF(...) fmt::enable_if_t<(__VA_ARGS__), int> = 0
+#endif
+
+// This is defined in base.h instead of format.h to avoid injecting in std.
+// It is a template to avoid undesirable implicit conversions to std::byte.
+#ifdef __cpp_lib_byte
+template <typename T, FMT_ENABLE_IF(std::is_same<T, std::byte>::value)>
+inline auto format_as(T b) -> unsigned char {
+  return static_cast<unsigned char>(b);
+}
+#endif
+
+namespace detail {
+// Suppresses "unused variable" warnings with the method described in
+// https://herbsutter.com/2009/10/18/mailbag-shutting-up-compiler-warnings/.
+// (void)var does not work on many Intel compilers.
+template <typename... T> FMT_CONSTEXPR void ignore_unused(const T&...) {}
+
+constexpr auto is_constant_evaluated(bool default_value = false) noexcept
+    -> bool {
+// Workaround for incompatibility between libstdc++ consteval-based
+// std::is_constant_evaluated() implementation and clang-14:
+// https://github.com/fmtlib/fmt/issues/3247.
+#if FMT_CPLUSPLUS >= 202002L && FMT_GLIBCXX_RELEASE >= 12 && \
+    (FMT_CLANG_VERSION >= 1400 && FMT_CLANG_VERSION < 1500)
+  ignore_unused(default_value);
+  return __builtin_is_constant_evaluated();
+#elif defined(__cpp_lib_is_constant_evaluated)
+  ignore_unused(default_value);
+  return std::is_constant_evaluated();
+#else
+  return default_value;
+#endif
+}
+
+// Suppresses "conditional expression is constant" warnings.
+template <typename T> constexpr auto const_check(T value) -> T { return value; }
+
+FMT_NORETURN FMT_API void assert_fail(const char* file, int line,
+                                      const char* message);
+
+#if defined(FMT_ASSERT)
+// Use the provided definition.
+#elif defined(NDEBUG)
+// FMT_ASSERT is not empty to avoid -Wempty-body.
+#  define FMT_ASSERT(condition, message) \
+    fmt::detail::ignore_unused((condition), (message))
+#else
+#  define FMT_ASSERT(condition, message)                                    \
+    ((condition) /* void() fails with -Winvalid-constexpr on clang 4.0.1 */ \
+         ? (void)0                                                          \
+         : fmt::detail::assert_fail(__FILE__, __LINE__, (message)))
+#endif
+
+#ifdef FMT_USE_INT128
+// Do nothing.
+#elif defined(__SIZEOF_INT128__) && !defined(__NVCC__) && \
+    !(FMT_CLANG_VERSION && FMT_MSC_VERSION)
+#  define FMT_USE_INT128 1
+using int128_opt = __int128_t;  // An optional native 128-bit integer.
+using uint128_opt = __uint128_t;
+template <typename T> inline auto convert_for_visit(T value) -> T {
+  return value;
+}
+#else
+#  define FMT_USE_INT128 0
+#endif
+#if !FMT_USE_INT128
+enum class int128_opt {};
+enum class uint128_opt {};
+// Reduce template instantiations.
+template <typename T> auto convert_for_visit(T) -> monostate { return {}; }
+#endif
+
+// Casts a nonnegative integer to unsigned.
+template <typename Int>
+FMT_CONSTEXPR auto to_unsigned(Int value) -> make_unsigned_t<Int> {
+  FMT_ASSERT(std::is_unsigned<Int>::value || value >= 0, "negative value");
+  return static_cast<make_unsigned_t<Int>>(value);
+}
+
+// A heuristic to detect std::string and std::[experimental::]string_view.
+// It is mainly used to avoid dependency on <[experimental/]string_view>.
+template <typename T, typename Enable = void>
+struct is_std_string_like : std::false_type {};
+template <typename T>
+struct is_std_string_like<T, void_t<decltype(std::declval<T>().find_first_of(
+                                 typename T::value_type(), 0))>>
+    : std::is_convertible<decltype(std::declval<T>().data()),
+                          const typename T::value_type*> {};
+
+// Returns true iff the literal encoding is UTF-8.
+constexpr auto is_utf8_enabled() -> bool {
+  // Avoid an MSVC sign extension bug: https://github.com/fmtlib/fmt/pull/2297.
+  using uchar = unsigned char;
+  return sizeof("\u00A7") == 3 && uchar("\u00A7"[0]) == 0xC2 &&
+         uchar("\u00A7"[1]) == 0xA7;
+}
+constexpr auto use_utf8() -> bool {
+  return !FMT_MSC_VERSION || is_utf8_enabled();
+}
+
+static_assert(!FMT_UNICODE || use_utf8(),
+              "Unicode support requires compiling with /utf-8");
+
+template <typename Char> FMT_CONSTEXPR auto length(const Char* s) -> size_t {
+  size_t count = 0;  // Characters seen before the terminating null.
+  for (; *s; ++s) ++count;
+  return count;
+}
+
+template <typename Char>
+FMT_CONSTEXPR auto compare(const Char* s1, const Char* s2, std::size_t n)
+    -> int {
+  if (sizeof(Char) == 1 && !is_constant_evaluated()) return memcmp(s1, s2, n);
+  for (std::size_t i = 0; i < n; ++i) {  // Element-wise fallback.
+    if (s1[i] < s2[i]) return -1;
+    if (s1[i] > s2[i]) return 1;
+  }
+  return 0;
+}
+
+namespace adl {
+using namespace std;
+
+template <typename Container>
+auto invoke_back_inserter()
+    -> decltype(back_inserter(std::declval<Container&>()));
+}  // namespace adl
+
+template <typename It, typename Enable = std::true_type>
+struct is_back_insert_iterator : std::false_type {};
+
+template <typename It>
+struct is_back_insert_iterator<
+    It, bool_constant<std::is_same<
+            decltype(adl::invoke_back_inserter<typename It::container_type>()),
+            It>::value>> : std::true_type {};
+
+// Extracts a reference to the container from *insert_iterator.
+template <typename OutputIt>
+inline auto get_container(OutputIt it) -> typename OutputIt::container_type& {
+  struct accessor : OutputIt {  // Derives from the iterator to reach its member.
+    accessor(OutputIt base) : OutputIt(base) {}
+    using OutputIt::container;  // Make the inherited `container` accessible.
+  };
+  return *accessor(it).container;
+}
+}  // namespace detail
+
+// Checks whether T is a container with contiguous storage.
+template <typename T> struct is_contiguous : std::false_type {};
+
+/**
+ * An implementation of `std::basic_string_view` for pre-C++17. It provides a
+ * subset of the API. `fmt::basic_string_view` is used for format strings even
+ * if `std::basic_string_view` is available to prevent issues when a library is
+ * compiled with a different `-std` option than the client code (which is not
+ * recommended).
+ */
+FMT_EXPORT
+template <typename Char> class basic_string_view {
+ private:
+  const Char* data_;
+  size_t size_;
+
+ public:
+  using value_type = Char;
+  using iterator = const Char*;
+
+  constexpr basic_string_view() noexcept : data_(nullptr), size_(0) {}
+
+  /// Constructs a string reference object from a C string and a size.
+  constexpr basic_string_view(const Char* s, size_t count) noexcept
+      : data_(s), size_(count) {}
+
+  constexpr basic_string_view(std::nullptr_t) = delete;
+
+  /// Constructs a string reference object from a C string.
+  FMT_CONSTEXPR20
+  basic_string_view(const Char* s)
+      : data_(s),
+        size_(detail::const_check(std::is_same<Char, char>::value &&
+                                  !detail::is_constant_evaluated(false))
+                  ? strlen(reinterpret_cast<const char*>(s))
+                  : detail::length(s)) {}
+
+  /// Constructs a string reference from a `std::basic_string` or a
+  /// `std::basic_string_view` object.
+  template <typename S,
+            FMT_ENABLE_IF(detail::is_std_string_like<S>::value&& std::is_same<
+                          typename S::value_type, Char>::value)>
+  FMT_CONSTEXPR basic_string_view(const S& s) noexcept
+      : data_(s.data()), size_(s.size()) {}
+
+  /// Returns a pointer to the string data.
+  constexpr auto data() const noexcept -> const Char* { return data_; }
+
+  /// Returns the string size.
+  constexpr auto size() const noexcept -> size_t { return size_; }
+
+  constexpr auto begin() const noexcept -> iterator { return data_; }
+  constexpr auto end() const noexcept -> iterator { return data_ + size_; }
+
+  constexpr auto operator[](size_t pos) const noexcept -> const Char& {
+    return data_[pos];
+  }
+
+  FMT_CONSTEXPR void remove_prefix(size_t n) noexcept {
+    data_ += n;
+    size_ -= n;
+  }
+
+  FMT_CONSTEXPR auto starts_with(basic_string_view<Char> sv) const noexcept
+      -> bool {
+    return size_ >= sv.size_ && detail::compare(data_, sv.data_, sv.size_) == 0;
+  }
+  FMT_CONSTEXPR auto starts_with(Char c) const noexcept -> bool {
+    return size_ >= 1 && *data_ == c;
+  }
+  FMT_CONSTEXPR auto starts_with(const Char* s) const -> bool {
+    return starts_with(basic_string_view<Char>(s));
+  }
+
+  // Lexicographically compare this string reference to other.
+  FMT_CONSTEXPR auto compare(basic_string_view other) const -> int {
+    size_t str_size = size_ < other.size_ ? size_ : other.size_;
+    int result = detail::compare(data_, other.data_, str_size);
+    if (result == 0)
+      result = size_ == other.size_ ? 0 : (size_ < other.size_ ? -1 : 1);
+    return result;
+  }
+
+  FMT_CONSTEXPR friend auto operator==(basic_string_view lhs,
+                                       basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) == 0;
+  }
+  friend auto operator!=(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) != 0;
+  }
+  friend auto operator<(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) < 0;
+  }
+  friend auto operator<=(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) <= 0;
+  }
+  friend auto operator>(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) > 0;
+  }
+  friend auto operator>=(basic_string_view lhs, basic_string_view rhs) -> bool {
+    return lhs.compare(rhs) >= 0;
+  }
+};
+
+FMT_EXPORT
+using string_view = basic_string_view<char>;
+
+/// Specifies if `T` is a character type. Can be specialized by users.
+FMT_EXPORT
+template <typename T> struct is_char : std::false_type {};
+template <> struct is_char<char> : std::true_type {};
+
+namespace detail {
+
+// Constructs fmt::basic_string_view<Char> from types implicitly convertible
+// to it, deducing Char. Explicitly convertible types such as the ones returned
+// from FMT_STRING are intentionally excluded.
+template <typename Char, FMT_ENABLE_IF(is_char<Char>::value)>
+constexpr auto to_string_view(const Char* s) -> basic_string_view<Char> {
+  return s;
+}
+template <typename T, FMT_ENABLE_IF(is_std_string_like<T>::value)>
+constexpr auto to_string_view(const T& s)
+    -> basic_string_view<typename T::value_type> {
+  return s;
+}
+template <typename Char>
+constexpr auto to_string_view(basic_string_view<Char> s)
+    -> basic_string_view<Char> {
+  return s;
+}
+
+template <typename T, typename Enable = void>
+struct has_to_string_view : std::false_type {};
+// detail:: is intentional since to_string_view is not an extension point.
+template <typename T>
+struct has_to_string_view<
+    T, void_t<decltype(detail::to_string_view(std::declval<T>()))>>
+    : std::true_type {};
+
+template <typename Char, Char... C> struct string_literal {
+  static constexpr Char value[sizeof...(C)] = {C...};
+  constexpr operator basic_string_view<Char>() const {
+    return {value, sizeof...(C)};
+  }
+};
+#if FMT_CPLUSPLUS < 201703L
+template <typename Char, Char... C>
+constexpr Char string_literal<Char, C...>::value[sizeof...(C)];
+#endif
+
+enum class type {
+  none_type,
+  // Integer types should go first,
+  int_type,
+  uint_type,
+  long_long_type,
+  ulong_long_type,
+  int128_type,
+  uint128_type,
+  bool_type,
+  char_type,
+  last_integer_type = char_type,
+  // followed by floating-point types.
+  float_type,
+  double_type,
+  long_double_type,
+  last_numeric_type = long_double_type,
+  cstring_type,
+  string_type,
+  pointer_type,
+  custom_type
+};
+
+// Maps core type T to the corresponding type enum constant.
+template <typename T, typename Char>
+struct type_constant : std::integral_constant<type, type::custom_type> {};
+
+#define FMT_TYPE_CONSTANT(Type, constant) \
+  template <typename Char>                \
+  struct type_constant<Type, Char>        \
+      : std::integral_constant<type, type::constant> {}
+
+FMT_TYPE_CONSTANT(int, int_type);
+FMT_TYPE_CONSTANT(unsigned, uint_type);
+FMT_TYPE_CONSTANT(long long, long_long_type);
+FMT_TYPE_CONSTANT(unsigned long long, ulong_long_type);
+FMT_TYPE_CONSTANT(int128_opt, int128_type);
+FMT_TYPE_CONSTANT(uint128_opt, uint128_type);
+FMT_TYPE_CONSTANT(bool, bool_type);
+FMT_TYPE_CONSTANT(Char, char_type);
+FMT_TYPE_CONSTANT(float, float_type);
+FMT_TYPE_CONSTANT(double, double_type);
+FMT_TYPE_CONSTANT(long double, long_double_type);
+FMT_TYPE_CONSTANT(const Char*, cstring_type);
+FMT_TYPE_CONSTANT(basic_string_view<Char>, string_type);
+FMT_TYPE_CONSTANT(const void*, pointer_type);
+
+constexpr auto is_integral_type(type t) -> bool {
+  return t > type::none_type && t <= type::last_integer_type;
+}
+constexpr auto is_arithmetic_type(type t) -> bool {
+  return t > type::none_type && t <= type::last_numeric_type;
+}
+
+constexpr auto set(type rhs) -> int { return 1 << static_cast<int>(rhs); }
+constexpr auto in(type t, int set) -> bool {
+  return ((set >> static_cast<int>(t)) & 1) != 0;
+}
+
+// Bitsets of types.
+enum {
+  sint_set =
+      set(type::int_type) | set(type::long_long_type) | set(type::int128_type),
+  uint_set = set(type::uint_type) | set(type::ulong_long_type) |
+             set(type::uint128_type),
+  bool_set = set(type::bool_type),
+  char_set = set(type::char_type),
+  float_set = set(type::float_type) | set(type::double_type) |
+              set(type::long_double_type),
+  string_set = set(type::string_type),
+  cstring_set = set(type::cstring_type),
+  pointer_set = set(type::pointer_type)
+};
+}  // namespace detail
+
+/// Reports a format error at compile time or, via a `format_error` exception,
+/// at runtime.
+// This function is intentionally not constexpr to give a compile-time error.
+FMT_NORETURN FMT_API void report_error(const char* message);
+
+FMT_DEPRECATED FMT_NORETURN inline void throw_format_error(
+    const char* message) {
+  report_error(message);
+}
+
+/// String's character (code unit) type.
+template <typename S,
+          typename V = decltype(detail::to_string_view(std::declval<S>()))>
+using char_t = typename V::value_type;
+
+/**
+ * Parsing context consisting of a format string range being parsed and an
+ * argument counter for automatic indexing.
+ * You can use the `format_parse_context` type alias for `char` instead.
+ */
+FMT_EXPORT
+template <typename Char> class basic_format_parse_context {
+ private:
+  basic_string_view<Char> format_str_;  // Remaining range to parse.
+  int next_arg_id_;  // Next automatic index; -1 once manual indexing is used.
+
+  FMT_CONSTEXPR void do_check_arg_id(int id);
+
+ public:
+  using char_type = Char;
+  using iterator = const Char*;
+
+  explicit constexpr basic_format_parse_context(
+      basic_string_view<Char> format_str, int next_arg_id = 0)
+      : format_str_(format_str), next_arg_id_(next_arg_id) {}
+
+  /// Returns an iterator to the beginning of the format string range being
+  /// parsed.
+  constexpr auto begin() const noexcept -> iterator {
+    return format_str_.begin();
+  }
+
+  /// Returns an iterator past the end of the format string range being parsed.
+  constexpr auto end() const noexcept -> iterator { return format_str_.end(); }
+
+  /// Advances the begin iterator to `it`.
+  FMT_CONSTEXPR void advance_to(iterator it) {
+    format_str_.remove_prefix(detail::to_unsigned(it - begin()));
+  }
+
+  /// Reports an error if using the manual argument indexing; otherwise returns
+  /// the next argument index and switches to the automatic indexing.
+  FMT_CONSTEXPR auto next_arg_id() -> int {
+    if (next_arg_id_ < 0) {
+      report_error("cannot switch from manual to automatic argument indexing");
+      return 0;
+    }
+    int id = next_arg_id_++;
+    do_check_arg_id(id);
+    return id;
+  }
+
+  /// Reports an error if using the automatic argument indexing; otherwise
+  /// switches to the manual indexing.
+  FMT_CONSTEXPR void check_arg_id(int id) {
+    if (next_arg_id_ > 0) {
+      report_error("cannot switch from automatic to manual argument indexing");
+      return;
+    }
+    next_arg_id_ = -1;
+    do_check_arg_id(id);
+  }
+  FMT_CONSTEXPR void check_arg_id(basic_string_view<Char>) {
+    next_arg_id_ = -1;  // Same manual-indexing marker as the int overload.
+  }
+  FMT_CONSTEXPR void check_dynamic_spec(int arg_id);
+};
+
+FMT_EXPORT
+using format_parse_context = basic_format_parse_context<char>;
+
+namespace detail {
+// A parse context with extra data used only in compile-time checks.
+template <typename Char>
+class compile_parse_context : public basic_format_parse_context<Char> {
+ private:
+  int num_args_;       // Argument ids at or above this are "not found".
+  const type* types_;  // Per-id type tags; check_dynamic_spec tolerates null.
+  using base = basic_format_parse_context<Char>;
+
+ public:
+  explicit FMT_CONSTEXPR compile_parse_context(
+      basic_string_view<Char> format_str, int num_args, const type* types,
+      int next_arg_id = 0)
+      : base(format_str, next_arg_id), num_args_(num_args), types_(types) {}
+
+  constexpr auto num_args() const -> int { return num_args_; }
+  constexpr auto arg_type(int id) const -> type { return types_[id]; }
+
+  FMT_CONSTEXPR auto next_arg_id() -> int {
+    int id = base::next_arg_id();
+    if (id >= num_args_) report_error("argument not found");
+    return id;
+  }
+
+  FMT_CONSTEXPR void check_arg_id(int id) {
+    base::check_arg_id(id);
+    if (id >= num_args_) report_error("argument not found");
+  }
+  using base::check_arg_id;  // Keep the base string_view overload visible.
+
+  FMT_CONSTEXPR void check_dynamic_spec(int arg_id) {
+    detail::ignore_unused(arg_id);
+    if (arg_id < num_args_ && types_ && !is_integral_type(types_[arg_id]))
+      report_error("width/precision is not integer");
+  }
+};
+
+/// A contiguous memory buffer with an optional growing ability. It is an
+/// internal class and shouldn't be used directly, only via `memory_buffer`.
+template <typename T> class buffer {
+ private:
+  T* ptr_;           // Start of the storage currently in use.
+  size_t size_;      // Number of elements currently stored.
+  size_t capacity_;  // Number of elements the storage can hold.
+
+  using grow_fun = void (*)(buffer& buf, size_t capacity);
+  grow_fun grow_;  // Called by try_reserve() when more capacity is requested.
+
+ protected:
+  // Don't initialize ptr_ since it is not accessed to save a few cycles.
+  FMT_MSC_WARNING(suppress : 26495)
+  FMT_CONSTEXPR20 buffer(grow_fun grow, size_t sz) noexcept
+      : size_(sz), capacity_(sz), grow_(grow) {}
+
+  constexpr buffer(grow_fun grow, T* p = nullptr, size_t sz = 0,
+                   size_t cap = 0) noexcept
+      : ptr_(p), size_(sz), capacity_(cap), grow_(grow) {}
+
+  FMT_CONSTEXPR20 ~buffer() = default;
+  buffer(buffer&&) = default;
+
+  /// Sets the buffer data and capacity; the size is left unchanged.
+  FMT_CONSTEXPR void set(T* buf_data, size_t buf_capacity) noexcept {
+    ptr_ = buf_data;
+    capacity_ = buf_capacity;
+  }
+
+ public:
+  using value_type = T;
+  using const_reference = const T&;
+
+  buffer(const buffer&) = delete;
+  void operator=(const buffer&) = delete;
+
+  auto begin() noexcept -> T* { return ptr_; }
+  auto end() noexcept -> T* { return ptr_ + size_; }  // One past the last.
+
+  auto begin() const noexcept -> const T* { return ptr_; }
+  auto end() const noexcept -> const T* { return ptr_ + size_; }
+
+  /// Returns the size of this buffer.
+  constexpr auto size() const noexcept -> size_t { return size_; }
+
+  /// Returns the capacity of this buffer.
+  constexpr auto capacity() const noexcept -> size_t { return capacity_; }
+
+  /// Returns a pointer to the buffer data (not null-terminated).
+  FMT_CONSTEXPR auto data() noexcept -> T* { return ptr_; }
+  FMT_CONSTEXPR auto data() const noexcept -> const T* { return ptr_; }
+
+  /// Clears this buffer by resetting the size to zero; capacity is unchanged.
+  void clear() { size_ = 0; }
+
+  // Tries resizing the buffer to contain `count` elements. If T is a POD type
+  // the new elements may not be initialized.
+  FMT_CONSTEXPR void try_resize(size_t count) {
+    try_reserve(count);
+    size_ = count <= capacity_ ? count : capacity_;  // Clamp to capacity.
+  }
+
+  // Tries increasing the buffer capacity to `new_capacity`. It can increase the
+  // capacity by a smaller amount than requested but guarantees there is space
+  // for at least one additional element either by increasing the capacity or by
+  // flushing the buffer if it is full.
+  FMT_CONSTEXPR void try_reserve(size_t new_capacity) {
+    if (new_capacity > capacity_) grow_(*this, new_capacity);
+  }
+
+  FMT_CONSTEXPR void push_back(const T& value) {
+    try_reserve(size_ + 1);  // Relies on the one-free-slot guarantee above.
+    ptr_[size_++] = value;
+  }
+
+  /// Appends data to the end of the buffer.
+  template <typename U> void append(const U* begin, const U* end) {
+    while (begin != end) {  // Each pass copies as much as currently fits.
+      auto count = to_unsigned(end - begin);
+      try_reserve(size_ + count);
+      auto free_cap = capacity_ - size_;
+      if (free_cap < count) count = free_cap;  // Rest goes in the next pass.
+      // A loop is faster than memcpy on small sizes.
+      T* out = ptr_ + size_;
+      for (size_t i = 0; i < count; ++i) out[i] = begin[i];
+      size_ += count;
+      begin += count;
+    }
+  }
+
+  template <typename Idx> FMT_CONSTEXPR auto operator[](Idx index) -> T& {
+    return ptr_[index];  // No bounds checking.
+  }
+  template <typename Idx>
+  FMT_CONSTEXPR auto operator[](Idx index) const -> const T& {
+    return ptr_[index];  // No bounds checking.
+  }
+};
+
+struct buffer_traits {  // Default traits: no counting and no size limit.
+  explicit buffer_traits(size_t) {}  // The argument is ignored.
+  auto count() const -> size_t { return 0; }
+  auto limit(size_t size) -> size_t { return size; }  // Never truncates.
+};
+
+class fixed_buffer_traits {  // Traits that cap the amount let through.
+ private:
+  size_t count_ = 0;  // Running total of the sizes passed to limit().
+  size_t limit_;      // Upper bound on the total that limit() lets through.
+
+ public:
+  explicit fixed_buffer_traits(size_t limit) : limit_(limit) {}
+  auto count() const -> size_t { return count_; }
+  auto limit(size_t size) -> size_t {
+    size_t n = limit_ > count_ ? limit_ - count_ : 0;  // Room still left.
+    count_ += size;  // The full request is counted even when truncated.
+    return size < n ? size : n;
+  }
+};
+
+// A buffer that writes to an output iterator when flushed.
+template <typename OutputIt, typename T, typename Traits = buffer_traits>
+class iterator_buffer : public Traits, public buffer<T> {
+ private:
+  OutputIt out_;
+  enum { buffer_size = 256 };
+  T data_[buffer_size];  // Staging array handed to buffer<T> as its storage.
+
+  static FMT_CONSTEXPR void grow(buffer<T>& buf, size_t) {  // Flush when full.
+    if (buf.size() == buffer_size) static_cast<iterator_buffer&>(buf).flush();
+  }
+
+  void flush() {  // Copies the staged elements to out_ and empties the buffer.
+    auto size = this->size();
+    this->clear();
+    const T* begin = data_;
+    const T* end = begin + this->limit(size);  // Traits may truncate here.
+    while (begin != end) *out_++ = *begin++;
+  }
+
+ public:
+  explicit iterator_buffer(OutputIt out, size_t n = buffer_size)
+      : Traits(n), buffer<T>(grow, data_, 0, buffer_size), out_(out) {}
+  iterator_buffer(iterator_buffer&& other) noexcept  // Starts with size 0.
+      : Traits(other),
+        buffer<T>(grow, data_, 0, buffer_size),
+        out_(other.out_) {}
+  ~iterator_buffer() {
+    // Don't crash if flush fails during unwinding.
+    FMT_TRY { flush(); }
+    FMT_CATCH(...) {}
+  }
+
+  auto out() -> OutputIt {
+    flush();  // Write out pending elements before exposing the iterator.
+    return out_;
+  }
+  auto count() const -> size_t { return Traits::count() + this->size(); }
+};
+
+template <typename T>
+class iterator_buffer<T*, T, fixed_buffer_traits> : public fixed_buffer_traits,
+                                                    public buffer<T> {
+ private:
+  T* out_;
+  enum { buffer_size = 256 };
+  T data_[buffer_size];  // Storage switched to once out_ has been consumed.
+
+  static FMT_CONSTEXPR void grow(buffer<T>& buf, size_t) {
+    if (buf.size() == buf.capacity())  // Only flush when completely full.
+      static_cast<iterator_buffer&>(buf).flush();
+  }
+
+  void flush() {
+    size_t n = this->limit(this->size());
+    if (this->data() == out_) {  // Still writing directly into the output.
+      out_ += n;
+      this->set(data_, buffer_size);  // Continue in the internal array.
+    }
+    this->clear();
+  }
+
+ public:
+  explicit iterator_buffer(T* out, size_t n = buffer_size)  // Writes into out.
+      : fixed_buffer_traits(n), buffer<T>(grow, out, 0, n), out_(out) {}
+  iterator_buffer(iterator_buffer&& other) noexcept
+      : fixed_buffer_traits(other),
+        buffer<T>(static_cast<iterator_buffer&&>(other)),
+        out_(other.out_) {
+    if (this->data() != out_) {  // Source was using its own data_ array.
+      this->set(data_, buffer_size);  // Re-point at this object's array.
+      this->clear();
+    }
+  }
+  ~iterator_buffer() { flush(); }
+
+  auto out() -> T* {
+    flush();  // Advances out_ if data was written directly into the output.
+    return out_;
+  }
+  auto count() const -> size_t {
+    return fixed_buffer_traits::count() + this->size();
+  }
+};
+
+template <typename T> class iterator_buffer<T*, T> : public buffer<T> {
+ public:
+  explicit iterator_buffer(T* out, size_t = 0)  // Second argument is ignored.
+      : buffer<T>([](buffer<T>&, size_t) {}, out, 0, ~size_t()) {}  // No-op grow.
+
+  auto out() -> T* { return &*this->end(); }  // Position after written data.
+};
+
+// A buffer that writes to a container with the contiguous storage.
+template <typename OutputIt>
+class iterator_buffer<
+    OutputIt,
+    enable_if_t<detail::is_back_insert_iterator<OutputIt>::value &&
+                    is_contiguous<typename OutputIt::container_type>::value,
+                typename OutputIt::container_type::value_type>>
+    : public buffer<typename OutputIt::container_type::value_type> {
+ private:
+  using container_type = typename OutputIt::container_type;
+  using value_type = typename container_type::value_type;
+  container_type& container_;  // Referenced, not owned.
+
+  static FMT_CONSTEXPR void grow(buffer<value_type>& buf, size_t capacity) {
+    auto& self = static_cast<iterator_buffer&>(buf);
+    self.container_.resize(capacity);  // Container grows to the request.
+    self.set(&self.container_[0], capacity);  // Re-point at its storage.
+  }
+
+ public:
+  explicit iterator_buffer(container_type& c)  // Size/capacity start at c.size().
+      : buffer<value_type>(grow, c.size()), container_(c) {}
+  explicit iterator_buffer(OutputIt out, size_t = 0)  // Size arg is ignored.
+      : iterator_buffer(get_container(out)) {}
+
+  auto out() -> OutputIt { return back_inserter(container_); }
+};
+
+// A buffer that counts the number of code units written discarding the output.
+template <typename T = char> class counting_buffer : public buffer<T> {
+ private:
+  enum { buffer_size = 256 };
+  T data_[buffer_size];  // Scratch storage handed to buffer<T>.
+  size_t count_ = 0;     // Code units dropped by grow() so far.
+
+  static FMT_CONSTEXPR void grow(buffer<T>& buf, size_t) {
+    if (buf.size() != buffer_size) return;  // Still room in data_.
+    static_cast<counting_buffer&>(buf).count_ += buf.size();
+    buf.clear();  // Discard the contents; only the tally is kept.
+  }
+
+ public:
+  counting_buffer() : buffer<T>(grow, data_, 0, buffer_size) {}
+
+  // Total written so far: dropped units plus those still buffered. Const so
+  auto count() const -> size_t { return count_ + this->size(); }  // it matches iterator_buffer::count().
+};
+}  // namespace detail
+
+template <typename Char>
+FMT_CONSTEXPR void basic_format_parse_context<Char>::do_check_arg_id(int id) {
+  // Argument id is only checked at compile-time during parsing because
+  // formatting has its own validation.
+  if (detail::is_constant_evaluated() &&
+      (!FMT_GCC_VERSION || FMT_GCC_VERSION >= 1200)) {  // Non-GCC or >= 1200.
+    using context = detail::compile_parse_context<Char>;
+    if (id >= static_cast<context*>(this)->num_args())  // Downcast to derived.
+      report_error("argument not found");
+  }
+}
+
+template <typename Char>
+FMT_CONSTEXPR void basic_format_parse_context<Char>::check_dynamic_spec(
+    int arg_id) {
+  if (detail::is_constant_evaluated() &&  // Runtime calls do nothing.
+      (!FMT_GCC_VERSION || FMT_GCC_VERSION >= 1200)) {  // Non-GCC or >= 1200.
+    using context = detail::compile_parse_context<Char>;
+    static_cast<context*>(this)->check_dynamic_spec(arg_id);  // Delegate.
+  }
+}
+
+FMT_EXPORT template <typename Context> class basic_format_arg;
+FMT_EXPORT template <typename Context> class basic_format_args;
+FMT_EXPORT template <typename Context> class dynamic_format_arg_store;
+
+// A formatter for objects of type T.
+FMT_EXPORT
+template <typename T, typename Char = char, typename Enable = void>
+struct formatter {  // Primary template: disabled unless specialized.
+  // A deleted default constructor indicates a disabled formatter.
+  formatter() = delete;
+};
+
+// Specifies if T has an enabled formatter specialization. A type can be
+// formattable even if it doesn't have a formatter e.g. via a conversion.
+template <typename T, typename Context>
+using has_formatter =  // True when the context's formatter is constructible.
+    std::is_constructible<typename Context::template formatter_type<T>>;
+
+// An output iterator that appends to a buffer. It is used instead of
+// back_insert_iterator to reduce symbol sizes and avoid <iterator> dependency.
+template <typename T> class basic_appender {
+ private:
+  detail::buffer<T>* buffer_;  // Target buffer; not owned.
+
+  friend auto get_container(basic_appender app) -> detail::buffer<T>& {
+    return *app.buffer_;
+  }
+
+ public:
+  using iterator_category = int;
+  using value_type = T;
+  using difference_type = ptrdiff_t;
+  using pointer = T*;
+  using reference = T&;
+  using container_type = detail::buffer<T>;
+  FMT_UNCHECKED_ITERATOR(basic_appender);
+
+  FMT_CONSTEXPR basic_appender(detail::buffer<T>& buf) : buffer_(&buf) {}
+
+  auto operator=(T c) -> basic_appender& {  // Assignment appends c.
+    buffer_->push_back(c);
+    return *this;
+  }
+  auto operator*() -> basic_appender& { return *this; }   // Returns itself.
+  auto operator++() -> basic_appender& { return *this; }  // No position state.
+  auto operator++(int) -> basic_appender { return *this; }  // Returns a copy.
+};
+
+using appender = basic_appender<char>;
+
+namespace detail {
+template <typename T>
+struct is_back_insert_iterator<basic_appender<T>> : std::true_type {};
+
+template <typename T, typename Enable = void>
+struct locking : std::true_type {};  // Default: every type counts as locking.
+template <typename T>  // False when the formatter declares a `nonlocking` type.
+struct locking<T, void_t<typename formatter<remove_cvref_t<T>>::nonlocking>>
+    : std::false_type {};
+
+template <typename T = int> FMT_CONSTEXPR inline auto is_locking() -> bool {
+  return locking<T>::value;  // Single-type (and default) case.
+}
+template <typename T1, typename T2, typename... Tail>
+FMT_CONSTEXPR inline auto is_locking() -> bool {
+  return locking<T1>::value || is_locking<T2, Tail...>();  // True if any is.
+}
+
+// An optimized version of std::copy with the output value type (T).
+template <typename T, typename InputIt, typename OutputIt,
+          FMT_ENABLE_IF(is_back_insert_iterator<OutputIt>::value)>
+auto copy(InputIt begin, InputIt end, OutputIt out) -> OutputIt {
+  get_container(out).append(begin, end);  // Bulk append to the container.
+  return out;
+}
+
+template <typename T, typename InputIt, typename OutputIt,
+          FMT_ENABLE_IF(!is_back_insert_iterator<OutputIt>::value)>
+FMT_CONSTEXPR auto copy(InputIt begin, InputIt end, OutputIt out) -> OutputIt {
+  while (begin != end) *out++ = static_cast<T>(*begin++);  // Convert each to T.
+  return out;
+}
+
+template <typename T, typename V, typename OutputIt>
+FMT_CONSTEXPR auto copy(basic_string_view<V> s, OutputIt out) -> OutputIt {
+  return copy<T>(s.begin(), s.end(), out);  // String-view convenience overload.
+}
+
+template <typename Context, typename T>  // Viable if format() takes a const T&.
+constexpr auto has_const_formatter_impl(T*)
+    -> decltype(typename Context::template formatter_type<T>().format(
+                    std::declval<const T&>(), std::declval<Context&>()),
+                true) {
+  return true;
+}
+template <typename Context>
+constexpr auto has_const_formatter_impl(...) -> bool {  // Fallback overload.
+  return false;
+}
+template <typename T, typename Context>
+constexpr auto has_const_formatter() -> bool {
+  return has_const_formatter_impl<Context>(static_cast<T*>(nullptr));
+}
+
+template <typename It, typename Enable = std::true_type>
+struct is_buffer_appender : std::false_type {};  // Default: not an appender.
+template <typename It>  // True for back inserters whose container is a buffer.
+struct is_buffer_appender<
+    It, bool_constant<
+            is_back_insert_iterator<It>::value &&
+            std::is_base_of<buffer<typename It::container_type::value_type>,
+                            typename It::container_type>::value>>
+    : std::true_type {};
+
+// Maps an output iterator to a buffer.
+template <typename T, typename OutputIt,
+          FMT_ENABLE_IF(!is_buffer_appender<OutputIt>::value)>
+auto get_buffer(OutputIt out) -> iterator_buffer<OutputIt, T> {
+  return iterator_buffer<OutputIt, T>(out);  // Wrap in a new adapter buffer.
+}
+template <typename T, typename OutputIt,
+          FMT_ENABLE_IF(is_buffer_appender<OutputIt>::value)>
+auto get_buffer(OutputIt out) -> buffer<T>& {
+  return get_container(out);  // Reuse the buffer behind the appender.
+}
+
+template <typename Buf, typename OutputIt>
+auto get_iterator(Buf& buf, OutputIt) -> decltype(buf.out()) {
+  return buf.out();  // Only viable for buffer types that provide out().
+}
+template <typename T, typename OutputIt>
+auto get_iterator(buffer<T>&, OutputIt out) -> OutputIt {
+  return out;  // For buffer<T> the caller's iterator is returned unchanged.
+}
+
+struct view {};  // Empty tag base; named_arg derives from it.
+
+template <typename Char, typename T> struct named_arg : view {
+  const Char* name;
+  const T& value;  // Held by reference, not copied.
+  named_arg(const Char* n, const T& v) : name(n), value(v) {}
+};
+
+template <typename Char> struct named_arg_info {
+  const Char* name;
+  int id;  // Set to the argument's index by init_named_arg.
+};
+
+template <typename T> struct is_named_arg : std::false_type {};
+template <typename T> struct is_statically_named_arg : std::false_type {};
+
+template <typename T, typename Char>
+struct is_named_arg<named_arg<Char, T>> : std::true_type {};
+
+template <bool B = false> constexpr auto count() -> size_t { return B ? 1 : 0; }
+template <bool B1, bool B2, bool... Tail> constexpr auto count() -> size_t {
+  return (B1 ? 1 : 0) + count<B2, Tail...>();  // Number of true values.
+}
+
+template <typename... Args> constexpr auto count_named_args() -> size_t {
+  return count<is_named_arg<Args>::value...>();  // Empty pack yields 0.
+}
+
+template <typename... Args>
+constexpr auto count_statically_named_args() -> size_t {
+  return count<is_statically_named_arg<Args>::value...>();  // Empty pack: 0.
+}
+
+struct unformattable {};  // Result type arg_mapper uses to report an error.
+struct unformattable_char : unformattable {};     // Mixed character types.
+struct unformattable_pointer : unformattable {};  // Disallowed pointer types.
+
+template <typename Char> struct string_value {
+  const Char* data;
+  size_t size;
+};
+
+template <typename Char> struct named_arg_value {
+  const named_arg_info<Char>* data;
+  size_t size;
+};
+
+template <typename Context> struct custom_value {
+  using parse_context = typename Context::parse_context_type;
+  void* value;  // Type-erased pointer to the argument.
+  void (*format)(void* arg, parse_context& parse_ctx, Context& ctx);
+};
+
+// A formatting argument value.
+template <typename Context> class value {
+ public:
+  using char_type = typename Context::char_type;
+
+  union {  // Exactly one member is active, chosen by the constructor used.
+    monostate no_value;
+    int int_value;
+    unsigned uint_value;
+    long long long_long_value;
+    unsigned long long ulong_long_value;
+    int128_opt int128_value;
+    uint128_opt uint128_value;
+    bool bool_value;
+    char_type char_value;
+    float float_value;
+    double double_value;
+    long double long_double_value;
+    const void* pointer;
+    string_value<char_type> string;
+    custom_value<Context> custom;
+    named_arg_value<char_type> named_args;
+  };
+
+  constexpr FMT_ALWAYS_INLINE value() : no_value() {}
+  constexpr FMT_ALWAYS_INLINE value(int val) : int_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(unsigned val) : uint_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(long long val) : long_long_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(unsigned long long val)
+      : ulong_long_value(val) {}
+  FMT_ALWAYS_INLINE value(int128_opt val) : int128_value(val) {}
+  FMT_ALWAYS_INLINE value(uint128_opt val) : uint128_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(float val) : float_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(double val) : double_value(val) {}
+  FMT_ALWAYS_INLINE value(long double val) : long_double_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(bool val) : bool_value(val) {}
+  constexpr FMT_ALWAYS_INLINE value(char_type val) : char_value(val) {}
+  FMT_CONSTEXPR FMT_ALWAYS_INLINE value(const char_type* val) {
+    string.data = val;
+    if (is_constant_evaluated()) string.size = {};  // Otherwise left unset.
+  }
+  FMT_CONSTEXPR FMT_ALWAYS_INLINE value(basic_string_view<char_type> val) {
+    string.data = val.data();
+    string.size = val.size();
+  }
+  FMT_ALWAYS_INLINE value(const void* val) : pointer(val) {}
+  FMT_ALWAYS_INLINE value(const named_arg_info<char_type>* args, size_t size)
+      : named_args{args, size} {}
+
+  template <typename T> FMT_CONSTEXPR20 FMT_ALWAYS_INLINE value(T& val) {
+    using value_type = remove_const_t<T>;
+    // T may overload operator& e.g. std::vector<bool>::reference in libc++.
+#if defined(__cpp_if_constexpr)
+    if constexpr (std::is_same<decltype(&val), T*>::value)
+      custom.value = const_cast<value_type*>(&val);
+#endif
+    if (!is_constant_evaluated())  // Takes the address without operator&.
+      custom.value = const_cast<char*>(&reinterpret_cast<const char&>(val));
+    // Get the formatter type through the context to allow different contexts
+    // have different extension points, e.g. `formatter<T>` for `format` and
+    // `printf_formatter<T>` for `printf`.
+    custom.format = format_custom_arg<
+        value_type, typename Context::template formatter_type<value_type>>;
+  }
+  value(unformattable);          // Declared only; not defined in this class.
+  value(unformattable_char);     // Declared only; not defined in this class.
+  value(unformattable_pointer);  // Declared only; not defined in this class.
+
+ private:
+  // Formats an argument of a custom type, such as a user-defined class.
+  template <typename T, typename Formatter>
+  static void format_custom_arg(void* arg,
+                                typename Context::parse_context_type& parse_ctx,
+                                Context& ctx) {
+    auto f = Formatter();
+    parse_ctx.advance_to(f.parse(parse_ctx));  // Parse the specs first.
+    using qualified_type =
+        conditional_t<has_const_formatter<T, Context>(), const T, T>;
+    // format must be const for compatibility with std::format and compilation.
+    const auto& cf = f;
+    ctx.advance_to(cf.format(*static_cast<qualified_type*>(arg), ctx));
+  }
+};
+
+// To minimize the number of types we need to deal with, long is translated
+// either to int or to long long depending on its size.
+enum { long_short = sizeof(long) == sizeof(int) };  // 1 if long is int-sized.
+using long_type = conditional_t<long_short, int, long long>;
+using ulong_type = conditional_t<long_short, unsigned, unsigned long long>;
+
+template <typename T> struct format_as_result {
+  template <typename U,  // Enum or class types with a usable format_as(U).
+            FMT_ENABLE_IF(std::is_enum<U>::value || std::is_class<U>::value)>
+  static auto map(U*) -> remove_cvref_t<decltype(format_as(std::declval<U>()))>;
+  static auto map(...) -> void;  // Fallback: no format_as result.
+
+  using type = decltype(map(static_cast<T*>(nullptr)));
+};
+template <typename T> using format_as_t = typename format_as_result<T>::type;
+
+template <typename T>
+struct has_format_as  // True when format_as_t<T> is not void.
+    : bool_constant<!std::is_same<format_as_t<T>, void>::value> {};
+
+#define FMT_MAP_API FMT_CONSTEXPR FMT_ALWAYS_INLINE
+
+// Maps formatting arguments to core types.
+// arg_mapper reports errors by returning unformattable instead of using
+// static_assert because it's used in the is_formattable trait.
+template <typename Context> struct arg_mapper {
+  using char_type = typename Context::char_type;
+
+  FMT_MAP_API auto map(signed char val) -> int { return val; }
+  FMT_MAP_API auto map(unsigned char val) -> unsigned { return val; }
+  FMT_MAP_API auto map(short val) -> int { return val; }
+  FMT_MAP_API auto map(unsigned short val) -> unsigned { return val; }
+  FMT_MAP_API auto map(int val) -> int { return val; }
+  FMT_MAP_API auto map(unsigned val) -> unsigned { return val; }
+  FMT_MAP_API auto map(long val) -> long_type { return val; }
+  FMT_MAP_API auto map(unsigned long val) -> ulong_type { return val; }
+  FMT_MAP_API auto map(long long val) -> long long { return val; }
+  FMT_MAP_API auto map(unsigned long long val) -> unsigned long long {
+    return val;
+  }
+  FMT_MAP_API auto map(int128_opt val) -> int128_opt { return val; }
+  FMT_MAP_API auto map(uint128_opt val) -> uint128_opt { return val; }
+  FMT_MAP_API auto map(bool val) -> bool { return val; }
+
+  template <typename T, FMT_ENABLE_IF(std::is_same<T, char>::value ||
+                                      std::is_same<T, char_type>::value)>
+  FMT_MAP_API auto map(T val) -> char_type {  // char or the context's char.
+    return val;
+  }
+  template <typename T, enable_if_t<(std::is_same<T, wchar_t>::value ||
+#ifdef __cpp_char8_t
+                                     std::is_same<T, char8_t>::value ||
+#endif
+                                     std::is_same<T, char16_t>::value ||
+                                     std::is_same<T, char32_t>::value) &&
+                                        !std::is_same<T, char_type>::value,
+                                    int> = 0>
+  FMT_MAP_API auto map(T) -> unformattable_char {  // Mismatched char type.
+    return {};
+  }
+
+  FMT_MAP_API auto map(float val) -> float { return val; }
+  FMT_MAP_API auto map(double val) -> double { return val; }
+  FMT_MAP_API auto map(long double val) -> long double { return val; }
+
+  FMT_MAP_API auto map(char_type* val) -> const char_type* { return val; }
+  FMT_MAP_API auto map(const char_type* val) -> const char_type* { return val; }
+  template <typename T, typename Char = char_t<T>,
+            FMT_ENABLE_IF(std::is_same<Char, char_type>::value &&
+                          !std::is_pointer<T>::value)>
+  FMT_MAP_API auto map(const T& val) -> basic_string_view<Char> {
+    return to_string_view(val);  // String-like with a matching char type.
+  }
+  template <typename T, typename Char = char_t<T>,
+            FMT_ENABLE_IF(!std::is_same<Char, char_type>::value &&
+                          !std::is_pointer<T>::value)>
+  FMT_MAP_API auto map(const T&) -> unformattable_char {  // Char mismatch.
+    return {};
+  }
+
+  FMT_MAP_API auto map(void* val) -> const void* { return val; }
+  FMT_MAP_API auto map(const void* val) -> const void* { return val; }
+  FMT_MAP_API auto map(volatile void* val) -> const void* {
+    return const_cast<const void*>(val);  // Drops the volatile qualifier.
+  }
+  FMT_MAP_API auto map(const volatile void* val) -> const void* {
+    return const_cast<const void*>(val);  // Drops the volatile qualifier.
+  }
+  FMT_MAP_API auto map(std::nullptr_t val) -> const void* { return val; }
+
+  // Use SFINAE instead of a const T* parameter to avoid a conflict with the
+  // array overload.
+  template <
+      typename T,
+      FMT_ENABLE_IF(
+          std::is_pointer<T>::value || std::is_member_pointer<T>::value ||
+          std::is_function<typename std::remove_pointer<T>::type>::value ||
+          (std::is_array<T>::value &&
+           !std::is_convertible<T, const char_type*>::value))>
+  FMT_CONSTEXPR auto map(const T&) -> unformattable_pointer {
+    return {};
+  }
+
+  template <typename T, std::size_t N,
+            FMT_ENABLE_IF(!std::is_same<T, wchar_t>::value)>
+  FMT_MAP_API auto map(const T (&values)[N]) -> const T (&)[N] {
+    return values;  // Arrays are passed through by reference.
+  }
+
+  // Only map owning types because mapping views can be unsafe.
+  template <typename T, typename U = format_as_t<T>,
+            FMT_ENABLE_IF(std::is_arithmetic<U>::value)>
+  FMT_MAP_API auto map(const T& val) -> decltype(FMT_DECLTYPE_THIS map(U())) {
+    return map(format_as(val));  // Re-map the arithmetic format_as result.
+  }
+
+  template <typename T, typename U = remove_const_t<T>>
+  struct formattable : bool_constant<has_const_formatter<U, Context>() ||
+                                     (has_formatter<U, Context>::value &&
+                                      !std::is_const<T>::value)> {};
+
+  template <typename T, FMT_ENABLE_IF(formattable<T>::value)>
+  FMT_MAP_API auto do_map(T& val) -> T& {  // Formattable: pass through.
+    return val;
+  }
+  template <typename T, FMT_ENABLE_IF(!formattable<T>::value)>
+  FMT_MAP_API auto do_map(T&) -> unformattable {  // No usable formatter.
+    return {};
+  }
+
+  // is_fundamental is used to allow formatters for extended FP types.
+  template <typename T, typename U = remove_const_t<T>,
+            FMT_ENABLE_IF(
+                (std::is_class<U>::value || std::is_enum<U>::value ||
+                 std::is_union<U>::value || std::is_fundamental<U>::value) &&
+                !has_to_string_view<U>::value && !is_char<U>::value &&
+                !is_named_arg<U>::value && !std::is_integral<U>::value &&
+                !std::is_arithmetic<format_as_t<U>>::value)>
+  FMT_MAP_API auto map(T& val) -> decltype(FMT_DECLTYPE_THIS do_map(val)) {
+    return do_map(val);
+  }
+
+  template <typename T, FMT_ENABLE_IF(is_named_arg<T>::value)>
+  FMT_MAP_API auto map(const T& named_arg)
+      -> decltype(FMT_DECLTYPE_THIS map(named_arg.value)) {
+    return map(named_arg.value);  // Map the wrapped value; the name is unused.
+  }
+
+  auto map(...) -> unformattable { return {}; }  // Catch-all fallback.
+};
+
+// A type constant after applying arg_mapper<Context>.
+template <typename T, typename Context>
+using mapped_type_constant =
+    type_constant<decltype(arg_mapper<Context>().map(std::declval<const T&>())),
+                  typename Context::char_type>;
+
+enum { packed_arg_bits = 4 };  // Bits per argument type in encode_types().
+// Maximum number of arguments with packed types.
+enum { max_packed_args = 62 / packed_arg_bits };  // 15 with 4-bit codes.
+enum : unsigned long long { is_unpacked_bit = 1ULL << 63 };
+enum : unsigned long long { has_named_args_bit = 1ULL << 62 };
+
+template <typename It, typename T, typename Enable = void>
+struct is_output_iterator : std::false_type {};  // Default: not an output it.
+
+template <> struct is_output_iterator<appender, char> : std::true_type {};
+
+template <typename It, typename T>  // True when `*it++ = T` is well-formed.
+struct is_output_iterator<
+    It, T, void_t<decltype(*std::declval<It&>()++ = std::declval<T>())>>
+    : std::true_type {};
+
+// A type-erased reference to an std::locale to avoid a heavy <locale> include.
+class locale_ref {
+ private:
+  const void* locale_;  // A type-erased pointer to std::locale.
+
+ public:
+  constexpr locale_ref() : locale_(nullptr) {}  // Empty: refers to no locale.
+  template <typename Locale> explicit locale_ref(const Locale& loc);
+
+  explicit operator bool() const noexcept { return locale_ != nullptr; }
+
+  template <typename Locale> auto get() const -> Locale;  // Defined elsewhere.
+};
+
+template <typename> constexpr auto encode_types() -> unsigned long long {
+  return 0;  // Base case: no argument types left to encode.
+}
+
+template <typename Context, typename Arg, typename... Args>
+constexpr auto encode_types() -> unsigned long long {  // First arg: low bits.
+  return static_cast<unsigned>(mapped_type_constant<Arg, Context>::value) |
+         (encode_types<Context, Args...>() << packed_arg_bits);
+}
+
+template <typename Context, typename... T, size_t NUM_ARGS = sizeof...(T)>
+constexpr unsigned long long make_descriptor() {
+  return NUM_ARGS <= max_packed_args ? encode_types<Context, T...>()
+                                     : is_unpacked_bit | NUM_ARGS;  // Count only.
+}
+
+// This type is intentionally undefined, only used for errors. It is given an
+template <typename T, typename Char>
+#if FMT_CLANG_VERSION && FMT_CLANG_VERSION <= 1500
+// empty definition for these Clang versions; see
+// https://github.com/fmtlib/fmt/issues/3796
+struct type_is_unformattable_for {
+};
+#else
+struct type_is_unformattable_for;
+#endif
+
+template <bool PACKED, typename Context, typename T, FMT_ENABLE_IF(PACKED)>
+FMT_CONSTEXPR auto make_arg(T& val) -> value<Context> {
+  using arg_type = remove_cvref_t<decltype(arg_mapper<Context>().map(val))>;
+
+  // Use enum instead of constexpr because the latter may generate code.
+  enum {
+    formattable_char = !std::is_same<arg_type, unformattable_char>::value
+  };
+  static_assert(formattable_char, "Mixing character types is disallowed.");
+
+  // Formatting of arbitrary pointers is disallowed. If you want to format a
+  // pointer cast it to `void*` or `const void*`. In particular, this forbids
+  // formatting of `[const] volatile char*` printed as bool by iostreams.
+  enum {
+    formattable_pointer = !std::is_same<arg_type, unformattable_pointer>::value
+  };
+  static_assert(formattable_pointer,
+                "Formatting of non-void pointers is disallowed.");
+
+  enum { formattable = !std::is_same<arg_type, unformattable>::value };
+#if defined(__cpp_if_constexpr)
+  if constexpr (!formattable)  // Instantiates the error-only type for T.
+    type_is_unformattable_for<T, typename Context::char_type> _;
+#endif
+  static_assert(
+      formattable,
+      "Cannot format an argument. To make type T formattable provide a "
+      "formatter<T> specialization: https://fmt.dev/latest/api.html#udt");
+  return {arg_mapper<Context>().map(val)};  // Build value from the mapped arg.
+}
+
+template <typename Context, typename T>
+FMT_CONSTEXPR auto make_arg(T& val) -> basic_format_arg<Context> {
+  auto arg = basic_format_arg<Context>();
+  arg.type_ = mapped_type_constant<T, Context>::value;  // Tag with the type.
+  arg.value_ = make_arg<true, Context>(val);  // Reuse the packed overload.
+  return arg;
+}
+
+template <bool PACKED, typename Context, typename T, FMT_ENABLE_IF(!PACKED)>
+FMT_CONSTEXPR inline auto make_arg(T& val) -> basic_format_arg<Context> {
+  return make_arg<Context>(val);  // Forward to the two-parameter overload.
+}
+
+template <typename Context, size_t NUM_ARGS>
+using arg_t = conditional_t<NUM_ARGS <= max_packed_args, value<Context>,
+                            basic_format_arg<Context>>;
+
+template <typename Char, typename T, FMT_ENABLE_IF(!is_named_arg<T>::value)>
+void init_named_arg(named_arg_info<Char>*, int& arg_index, int&, const T&) {
+  ++arg_index;  // Unnamed argument: only the positional index advances.
+}
+template <typename Char, typename T, FMT_ENABLE_IF(is_named_arg<T>::value)>
+void init_named_arg(named_arg_info<Char>* named_args, int& arg_index,
+                    int& named_arg_index, const T& arg) {
+  named_args[named_arg_index++] = {arg.name, arg_index++};  // Record name/id.
+}
+
+// An array of references to arguments. It can be implicitly converted to
+// `fmt::basic_format_args` for passing into type-erased formatting functions
+// such as `fmt::vformat`.
+template <typename Context, size_t NUM_ARGS, size_t NUM_NAMED_ARGS,
+          unsigned long long DESC>
+struct format_arg_store {
+  // args_[0].named_args points to named_args to avoid bloating format_args.
+  // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning.
+  static constexpr size_t ARGS_ARR_SIZE = 1 + (NUM_ARGS != 0 ? NUM_ARGS : +1);
+
+  arg_t<Context, NUM_ARGS> args[ARGS_ARR_SIZE];
+  named_arg_info<typename Context::char_type> named_args[NUM_NAMED_ARGS];
+
+  template <typename... T>
+  FMT_MAP_API format_arg_store(T&... values)
+      : args{{named_args, NUM_NAMED_ARGS},  // Slot 0: the named-arg table.
+             make_arg<NUM_ARGS <= max_packed_args, Context>(values)...} {
+    using dummy = int[];  // Array-initializer trick to expand the pack.
+    int arg_index = 0, named_arg_index = 0;
+    (void)dummy{
+        0,
+        (init_named_arg(named_args, arg_index, named_arg_index, values), 0)...};
+  }
+
+  format_arg_store(format_arg_store&& rhs) {
+    args[0] = {named_args, NUM_NAMED_ARGS};  // Point at this object's table.
+    for (size_t i = 1; i < ARGS_ARR_SIZE; ++i) args[i] = rhs.args[i];
+    for (size_t i = 0; i < NUM_NAMED_ARGS; ++i)
+      named_args[i] = rhs.named_args[i];
+  }
+
+  format_arg_store(const format_arg_store& rhs) = delete;
+  format_arg_store& operator=(const format_arg_store& rhs) = delete;
+  format_arg_store& operator=(format_arg_store&& rhs) = delete;
+};
+
+// A specialization of format_arg_store without named arguments.
+// It is a plain struct to reduce binary size in debug mode.
+template <typename Context, size_t NUM_ARGS, unsigned long long DESC>
+struct format_arg_store<Context, NUM_ARGS, 0, DESC> {
+  // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning.
+  arg_t<Context, NUM_ARGS> args[NUM_ARGS != 0 ? NUM_ARGS : +1];  // Min. size 1.
+};
+
+}  // namespace detail
+FMT_BEGIN_EXPORT
+
+// A formatting argument. Context is a template parameter for the compiled API
+// where output can be unbuffered.
+//
+// The argument is stored type-erased: `value_` holds the payload and `type_`
+// is the tag that `visit` switches on to decide which member of `value_` to
+// read.
+template <typename Context> class basic_format_arg {
+ private:
+  detail::value<Context> value_;
+  detail::type type_;
+
+  // Friends with access to the private members and the private constructor.
+  template <typename ContextType, typename T>
+  friend FMT_CONSTEXPR auto detail::make_arg(T& value)
+      -> basic_format_arg<ContextType>;
+
+  friend class basic_format_args<Context>;
+  friend class dynamic_format_arg_store<Context>;
+
+  using char_type = typename Context::char_type;
+
+  template <typename, size_t, size_t, unsigned long long>
+  friend struct detail::format_arg_store;
+
+  // Builds the entry that carries a named-argument table (`args`, `size`).
+  // Only value_ is initialized here; type_ is not set by this constructor.
+  basic_format_arg(const detail::named_arg_info<char_type>* args, size_t size)
+      : value_(args, size) {}
+
+ public:
+  // A handle to an argument of custom type. It wraps the stored custom value
+  // and forwards formatting to the function stored alongside it.
+  class handle {
+   public:
+    explicit handle(detail::custom_value<Context> custom) : custom_(custom) {}
+
+    // Formats the wrapped value by calling the stored `format` function with
+    // the stored value pointer and the two contexts.
+    void format(typename Context::parse_context_type& parse_ctx,
+                Context& ctx) const {
+      custom_.format(custom_.value, parse_ctx, ctx);
+    }
+
+   private:
+    detail::custom_value<Context> custom_;
+  };
+
+  // Constructs an empty argument whose type is none_type.
+  constexpr basic_format_arg() : type_(detail::type::none_type) {}
+
+  // Returns true if the argument is not empty (its type is not none_type).
+  constexpr explicit operator bool() const noexcept {
+    return type_ != detail::type::none_type;
+  }
+
+  // Returns the type tag of the stored value.
+  auto type() const -> detail::type { return type_; }
+
+  // Type-category queries; both delegate to the detail helpers on type_.
+  auto is_integral() const -> bool { return detail::is_integral_type(type_); }
+  auto is_arithmetic() const -> bool {
+    return detail::is_arithmetic_type(type_);
+  }
+
+  /**
+   * Visits an argument dispatching to the appropriate visit method based on
+   * the argument type. For example, if the argument type is `double` then
+   * `vis(value)` will be called with the value of type `double`.
+   *
+   * An empty argument (`none_type`) is visited as `monostate()`, string
+   * arguments are passed as a `basic_string_view` and custom arguments are
+   * passed wrapped in a `handle`.
+   */
+  template <typename Visitor>
+  FMT_CONSTEXPR FMT_INLINE auto visit(Visitor&& vis) const -> decltype(vis(0)) {
+    switch (type_) {
+    case detail::type::none_type:
+      // Falls out of the switch to the monostate call below.
+      break;
+    case detail::type::int_type:
+      return vis(value_.int_value);
+    case detail::type::uint_type:
+      return vis(value_.uint_value);
+    case detail::type::long_long_type:
+      return vis(value_.long_long_value);
+    case detail::type::ulong_long_type:
+      return vis(value_.ulong_long_value);
+    case detail::type::int128_type:
+      return vis(detail::convert_for_visit(value_.int128_value));
+    case detail::type::uint128_type:
+      return vis(detail::convert_for_visit(value_.uint128_value));
+    case detail::type::bool_type:
+      return vis(value_.bool_value);
+    case detail::type::char_type:
+      return vis(value_.char_value);
+    case detail::type::float_type:
+      return vis(value_.float_value);
+    case detail::type::double_type:
+      return vis(value_.double_value);
+    case detail::type::long_double_type:
+      return vis(value_.long_double_value);
+    case detail::type::cstring_type:
+      return vis(value_.string.data);
+    case detail::type::string_type:
+      using sv = basic_string_view<typename Context::char_type>;
+      return vis(sv(value_.string.data, value_.string.size));
+    case detail::type::pointer_type:
+      return vis(value_.pointer);
+    case detail::type::custom_type:
+      return vis(typename basic_format_arg<Context>::handle(value_.custom));
+    }
+    return vis(monostate());
+  }
+
+  // Formats the argument if it holds a custom type: repositions `parse_ctx` at
+  // `parse_begin`, invokes the stored custom formatting function and returns
+  // true. Returns false, leaving both contexts untouched, for any other type.
+  auto format_custom(const char_type* parse_begin,
+                     typename Context::parse_context_type& parse_ctx,
+                     Context& ctx) -> bool {
+    if (type_ != detail::type::custom_type) return false;
+    parse_ctx.advance_to(parse_begin);
+    value_.custom.format(value_.custom.value, parse_ctx, ctx);
+    return true;
+  }
+};
+
+// Deprecated free-function spelling of `basic_format_arg::visit`: forwards
+// `vis` to `arg.visit` and returns whatever the visitor returns.
+template <typename Visitor, typename Context>
+FMT_DEPRECATED FMT_CONSTEXPR auto visit_format_arg(
+    Visitor&& vis, const basic_format_arg<Context>& arg) -> decltype(vis(0)) {
+  return arg.visit(static_cast<Visitor&&>(vis));
+}
+
+/**
+ * A view of a collection of formatting arguments. To avoid lifetime issues it
+ * should only be used as a parameter type in type-erased functions such as
+ * `vformat`:
+ *
+ *     void vlog(fmt::string_view fmt, fmt::format_args args);  // OK
+ *     fmt::format_args args = fmt::make_format_args();  // Dangling reference
+ */
+template <typename Context> class basic_format_args {
+ public:
+  using size_type = int;
+  using format_arg = basic_format_arg<Context>;
+
+ private:
+  // A descriptor that contains information about formatting arguments.
+  // If the number of arguments is less than or equal to max_packed_args then
+  // argument types are passed in the descriptor. This reduces binary code size
+  // per formatting function call.
+  unsigned long long desc_;
+  union {
+    // If is_packed() returns true then argument values are stored in values_;
+    // otherwise they are stored in args_. This is done to improve cache
+    // locality and reduce compiled code size since storing larger objects
+    // may require more code (at least on x86-64) even if the same amount of
+    // data is actually copied to stack. It saves ~10% on the bloat test.
+    const detail::value<Context>* values_;
+    const format_arg* args_;
+  };
+
+  // True when the is_unpacked_bit flag is clear in the descriptor.
+  constexpr auto is_packed() const -> bool {
+    return (desc_ & detail::is_unpacked_bit) == 0;
+  }
+  // True when the has_named_args_bit flag is set in the descriptor.
+  constexpr auto has_named_args() const -> bool {
+    return (desc_ & detail::has_named_args_bit) != 0;
+  }
+
+  // Extracts the type tag of the argument at `index` from the packed
+  // descriptor: each argument occupies packed_arg_bits bits.
+  FMT_CONSTEXPR auto type(int index) const -> detail::type {
+    int shift = index * detail::packed_arg_bits;
+    unsigned int mask = (1 << detail::packed_arg_bits) - 1;
+    return static_cast<detail::type>((desc_ >> shift) & mask);
+  }
+
+ public:
+  /// Constructs an empty view that refers to no arguments.
+  constexpr basic_format_args() : desc_(0), args_(nullptr) {}
+
+  /// Constructs a `basic_format_args` object from `format_arg_store`.
+  // Packed form (NUM_ARGS <= max_packed_args). When the store has named
+  // arguments its first element is skipped, so index 0 refers to the first
+  // real argument and index -1 to the skipped element.
+  template <size_t NUM_ARGS, size_t NUM_NAMED_ARGS, unsigned long long DESC,
+            FMT_ENABLE_IF(NUM_ARGS <= detail::max_packed_args)>
+  constexpr FMT_ALWAYS_INLINE basic_format_args(
+      const detail::format_arg_store<Context, NUM_ARGS, NUM_NAMED_ARGS, DESC>&
+          store)
+      : desc_(DESC), values_(store.args + (NUM_NAMED_ARGS != 0 ? 1 : 0)) {}
+
+  // Unpacked form (NUM_ARGS > max_packed_args); same offset rule as above.
+  template <size_t NUM_ARGS, size_t NUM_NAMED_ARGS, unsigned long long DESC,
+            FMT_ENABLE_IF(NUM_ARGS > detail::max_packed_args)>
+  constexpr basic_format_args(
+      const detail::format_arg_store<Context, NUM_ARGS, NUM_NAMED_ARGS, DESC>&
+          store)
+      : desc_(DESC), args_(store.args + (NUM_NAMED_ARGS != 0 ? 1 : 0)) {}
+
+  /// Constructs a `basic_format_args` object from `dynamic_format_arg_store`.
+  constexpr basic_format_args(const dynamic_format_arg_store<Context>& store)
+      : desc_(store.get_types()), args_(store.data()) {}
+
+  /// Constructs a `basic_format_args` object from a dynamic list of arguments.
+  constexpr basic_format_args(const format_arg* args, int count)
+      : desc_(detail::is_unpacked_bit | detail::to_unsigned(count)),
+        args_(args) {}
+
+  /// Returns the argument with the specified id, or an empty argument if `id`
+  /// is negative or does not refer to a stored argument.
+  FMT_CONSTEXPR auto get(int id) const -> format_arg {
+    format_arg arg;
+    if (!is_packed()) {
+      // Negative ids must be rejected explicitly: args_[-1] is either the
+      // named-argument table entry (not a regular argument) or lies outside
+      // the array, so it must never be handed out. This matches the packed
+      // path below, which rejects negative ids through the unsigned cast.
+      if (id >= 0 && id < max_size()) arg = args_[id];
+      return arg;
+    }
+    // The unsigned cast makes negative ids fail this range check as well.
+    if (static_cast<unsigned>(id) >= detail::max_packed_args) return arg;
+    arg.type_ = type(id);
+    // A none_type tag marks an unused slot; there is no value to copy.
+    if (arg.type_ == detail::type::none_type) return arg;
+    arg.value_ = values_[id];
+    return arg;
+  }
+
+  /// Returns the argument with the specified name, or an empty argument if
+  /// there is no argument with that name.
+  template <typename Char>
+  auto get(basic_string_view<Char> name) const -> format_arg {
+    int id = get_id(name);
+    return id >= 0 ? get(id) : format_arg();
+  }
+
+  /// Returns the index of the argument with the specified name, or -1 if
+  /// there are no named arguments or none of them matches.
+  template <typename Char>
+  FMT_CONSTEXPR auto get_id(basic_string_view<Char> name) const -> int {
+    if (!has_named_args()) return -1;
+    // The named-argument table is stored in the element just before the
+    // first regular argument, hence the -1 index.
+    const auto& named_args =
+        (is_packed() ? values_[-1] : args_[-1].value_).named_args;
+    for (size_t i = 0; i < named_args.size; ++i) {
+      if (named_args.data[i].name == name) return named_args.data[i].id;
+    }
+    return -1;
+  }
+
+  /// Returns an upper bound on the number of arguments: max_packed_args for
+  /// the packed representation, otherwise the descriptor with the
+  /// is_unpacked_bit flag masked off, converted to int.
+  auto max_size() const -> int {
+    unsigned long long max_packed = detail::max_packed_args;
+    return static_cast<int>(is_packed() ? max_packed
+                                        : desc_ & ~detail::is_unpacked_bit);
+  }
+};
+
+// A formatting context: bundles the output iterator, the view of the
+// formatting arguments and the locale reference used while formatting.
+// It can be moved but not copied.
+class context {
+ private:
+  appender out_;
+  basic_format_args<context> args_;
+  detail::locale_ref loc_;
+
+ public:
+  /// The character type for the output.
+  using char_type = char;
+
+  using iterator = appender;
+  using format_arg = basic_format_arg<context>;
+  using parse_context_type = basic_format_parse_context<char>;
+  template <typename T> using formatter_type = formatter<T, char>;
+
+  /// Constructs a `basic_format_context` object. References to the arguments
+  /// are stored in the object so make sure they have appropriate lifetimes.
+  FMT_CONSTEXPR context(iterator out, basic_format_args<context> ctx_args,
+                        detail::locale_ref loc = {})
+      : out_(out), args_(ctx_args), loc_(loc) {}
+  context(context&&) = default;
+  context(const context&) = delete;
+  void operator=(const context&) = delete;
+
+  // Argument lookup by index or by name; both return an empty argument when
+  // the lookup in args_ fails.
+  FMT_CONSTEXPR auto arg(int id) const -> format_arg { return args_.get(id); }
+  auto arg(string_view name) -> format_arg { return args_.get(name); }
+  // Returns the index of the named argument as reported by args_.get_id.
+  FMT_CONSTEXPR auto arg_id(string_view name) -> int {
+    return args_.get_id(name);
+  }
+  // Returns the stored view of the formatting arguments.
+  auto args() const -> const basic_format_args<context>& { return args_; }
+
+  // Returns an iterator to the beginning of the output range.
+  FMT_CONSTEXPR auto out() -> iterator { return out_; }
+
+  // Advances the begin iterator to `it`.
+  // The body is empty: the parameter is ignored and out_ is left unchanged.
+  void advance_to(iterator) {}
+
+  // Returns the locale reference passed at construction.
+  FMT_CONSTEXPR auto locale() -> detail::locale_ref { return loc_; }
+};
+
+// Forward declaration; used below for every output iterator other than
+// `appender`.
+template <typename OutputIt, typename Char> class generic_context;
+
+// Longer aliases for C++20 compatibility.
+// `basic_format_context<OutputIt, Char>` is `context` when OutputIt is
+// `appender` and `generic_context<OutputIt, Char>` otherwise.
+template <typename OutputIt, typename Char>
+using basic_format_context =
+    conditional_t<std::is_same<OutputIt, appender>::value, context,
+                  generic_context<OutputIt, Char>>;
+using format_context = context;
+
+// The context type whose output iterator is `basic_appender<Char>`.
+template <typename Char>
+using buffered_context = basic_format_context<basic_appender<Char>, Char>;
+
+// Trait that is true unless the type produced by mapping a `T&` through
+// `detail::arg_mapper<buffered_context<Char>>` derives from
+// `detail::unformattable`.
+template <typename T, typename Char = char>
+using is_formattable = bool_constant<!std::is_base_of<
+    detail::unformattable, decltype(detail::arg_mapper<buffered_context<Char>>()
+                                        .map(std::declval<T&>()))>::value>;
+
+#if FMT_USE_CONCEPTS
+// Concept form of is_formattable; references are stripped from T first.
+template <typename T, typename Char = char>
+concept formattable = is_formattable<remove_reference_t<T>, Char>::value;
+#endif
+
+/**
+ * Constructs an object that stores references to arguments and can be
+ * implicitly converted to `format_args`. `Context` can be omitted in which case
+ * it defaults to `format_context`. See `arg` for lifetime considerations.
+ */
+// Take arguments by lvalue references to avoid some lifetime issues, e.g.
+//   auto args = make_format_args(std::string());
+//
+// This overload is selected when none of the arguments is a named argument
+// (NUM_NAMED_ARGS == 0). The store is aggregate-initialized with one element
+// per argument; the first template argument of make_arg tells it whether the
+// packed representation (NUM_ARGS <= max_packed_args) is in use.
+template <typename Context = format_context, typename... T,
+          size_t NUM_ARGS = sizeof...(T),
+          size_t NUM_NAMED_ARGS = detail::count_named_args<T...>(),
+          unsigned long long DESC = detail::make_descriptor<Context, T...>(),
+          FMT_ENABLE_IF(NUM_NAMED_ARGS == 0)>
+constexpr FMT_ALWAYS_INLINE auto make_format_args(T&... args)
+    -> detail::format_arg_store<Context, NUM_ARGS, 0, DESC> {
+  return {{detail::make_arg<NUM_ARGS <= detail::max_packed_args, Context>(
+      args)...}};
+}
+
+#ifndef FMT_DOC
+// Overload selected when at least one argument is a named argument
+// (NUM_NAMED_ARGS != 0). The descriptor additionally carries
+// has_named_args_bit, and the arguments are passed to the constructor of the
+// format_arg_store specialization that has named-argument storage.
+template <typename Context = format_context, typename... T,
+          size_t NUM_NAMED_ARGS = detail::count_named_args<T...>(),
+          unsigned long long DESC =
+              detail::make_descriptor<Context, T...>() |
+              static_cast<unsigned long long>(detail::has_named_args_bit),
+          FMT_ENABLE_IF(NUM_NAMED_ARGS != 0)>
+constexpr auto make_format_args(T&... args)
+    -> detail::format_arg_store<Context, sizeof...(T), NUM_NAMED_ARGS, DESC> {
+  return {args...};
+}
+#endif
+
+/**
+ * Returns a named argument to be used in a formatting function.
+ * It should only be used in a call to a formatting function or
+ * `dynamic_format_arg_store::push_back`.
+ *
+ * `name` is the argument name and `arg` is the value to associate with it.
+ * Passing a value that is itself a named argument is rejected at compile
+ * time.
+ *
+ * **Example**:
+ *
+ *     fmt::print("The answer is {answer}.", fmt::arg("answer", 42));
+ */
+template <typename Char, typename T>
+inline auto arg(const Char* name, const T& arg) -> detail::named_arg<Char, T> {
+  static_assert(!detail::is_named_arg<T>(), "nested named arguments");
+  return {name, arg};
+}
+FMT_END_EXPORT
+
+/// An alias for `basic_format_args<format_context>`.
+// A separate type would result in shorter symbols but break ABI compatibility
+// between clang and gcc on ARM (#1919).
+FMT_EXPORT using format_args = basic_format_args<format_context>;
+
+// We cannot use enum classes as bit fields because of a gcc bug, so we put them
+// in namespaces instead (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414).
+// Additionally, if an underlying type is specified, older gcc incorrectly warns
+// that the type is too small. Both bugs are fixed in gcc 9.3.
+// FMT_ENUM_UNDERLYING_TYPE(type) therefore expands to nothing on gcc older
+// than 9.3 and to `: type` everywhere else.
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 903
+#  define FMT_ENUM_UNDERLYING_TYPE(type)
+#else
+#  define FMT_ENUM_UNDERLYING_TYPE(type) : type
+#endif
+// Alignment options. parse_align maps '<', '>' and '^' to left, right and
+// center; numeric is set by the '0' flag in parse_format_specs.
+namespace align {
+enum type FMT_ENUM_UNDERLYING_TYPE(unsigned char){none, left, right, center,
+                                                  numeric};
+}
+using align_t = align::type;
+// Sign options. parse_format_specs maps '-', '+' and ' ' to minus, plus and
+// space.
+namespace sign {
+enum type FMT_ENUM_UNDERLYING_TYPE(unsigned char){none, minus, plus, space};
+}
+using sign_t = sign::type;
+
+namespace detail {
+
+// The unsigned counterpart of Char when Char is an integral type
+// (std::make_unsigned<Char>::type), and plain `unsigned` otherwise.
+template <typename Char>
+using unsigned_char = typename conditional_t<std::is_integral<Char>::value,
+                                             std::make_unsigned<Char>,
+                                             type_identity<unsigned>>::type;
+
+// Character (code unit) type is erased to prevent template bloat.
+// Stores the fill as up to max_size chars plus a size; the default fill is a
+// single space.
+struct fill_t {
+ private:
+  enum { max_size = 4 };
+  char data_[max_size] = {' '};
+  unsigned char size_ = 1;
+
+ public:
+  // Sets the fill from a string view of any code unit type.
+  template <typename Char>
+  FMT_CONSTEXPR void operator=(basic_string_view<Char> s) {
+    auto size = s.size();
+    size_ = static_cast<unsigned char>(size);
+    if (size == 1) {
+      // A single code unit is split into its two low bytes so that a code
+      // unit wider than char can be reassembled by get<Char>().
+      unsigned uchar = static_cast<unsigned_char<Char>>(s[0]);
+      data_[0] = static_cast<char>(uchar);
+      data_[1] = static_cast<char>(uchar >> 8);
+      return;
+    }
+    // Multi-unit fills are copied one code unit per char of storage.
+    FMT_ASSERT(size <= max_size, "invalid fill");
+    for (size_t i = 0; i < size; ++i) data_[i] = static_cast<char>(s[i]);
+  }
+
+  // Sets the fill to the single character `c`.
+  FMT_CONSTEXPR void operator=(char c) {
+    data_[0] = c;
+    size_ = 1;
+  }
+
+  // Returns the number of code units in the fill.
+  constexpr auto size() const -> size_t { return size_; }
+
+  // Returns the fill as one code unit, combining data_[0] (low byte) and
+  // data_[1] (next byte) - the inverse of the single-unit branch above.
+  template <typename Char> constexpr auto get() const -> Char {
+    using uchar = unsigned char;
+    return static_cast<Char>(static_cast<uchar>(data_[0]) |
+                             (static_cast<uchar>(data_[1]) << 8));
+  }
+
+  // Returns a pointer to the stored fill when Char is char ...
+  template <typename Char, FMT_ENABLE_IF(std::is_same<Char, char>::value)>
+  constexpr auto data() const -> const Char* {
+    return data_;
+  }
+  // ... and nullptr for every other code unit type.
+  template <typename Char, FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
+  constexpr auto data() const -> const Char* {
+    return nullptr;
+  }
+};
+}  // namespace detail
+
+// The presentation type selected by the last character of a format spec.
+// The numeric values are reused across argument categories: `pointer` shares
+// the value 3 with `dec`, and the floating-point enumerators restart at 1, so
+// `exp`, `fixed`, `general` and `hexfloat` share values with `debug`,
+// `string`, `dec` and `hex` respectively. An enumerator is therefore only
+// meaningful together with the type of the argument it applies to.
+enum class presentation_type : unsigned char {
+  // Common specifiers:
+  none = 0,
+  debug = 1,   // '?'
+  string = 2,  // 's' (string, bool)
+
+  // Integral, bool and character specifiers:
+  dec = 3,  // 'd'
+  hex,      // 'x' or 'X'
+  oct,      // 'o'
+  bin,      // 'b' or 'B'
+  chr,      // 'c'
+
+  // String and pointer specifiers:
+  pointer = 3,  // 'p'
+
+  // Floating-point specifiers:
+  exp = 1,  // 'e' or 'E' (1 since there is no FP debug presentation)
+  fixed,    // 'f' or 'F'
+  general,  // 'g' or 'G'
+  hexfloat  // 'a' or 'A'
+};
+
+// Format specifiers for built-in and string types.
+// Alignment, sign and the boolean flags are packed into bit fields.
+struct format_specs {
+  int width;
+  int precision;
+  presentation_type type;
+  align_t align : 4;
+  sign_t sign : 3;
+  bool upper : 1;  // An uppercase version e.g. 'X' for 'x'.
+  bool alt : 1;    // Alternate form ('#').
+  bool localized : 1;  // Set by the 'L' specifier.
+  detail::fill_t fill;
+
+  // Defaults: width 0, precision -1, no presentation type, no alignment, no
+  // sign, all flags cleared; `fill` keeps fill_t's own default.
+  constexpr format_specs()
+      : width(0),
+        precision(-1),
+        type(presentation_type::none),
+        align(align::none),
+        sign(sign::none),
+        upper(false),
+        alt(false),
+        localized(false) {}
+};
+
+namespace detail {
+
+// Tells which member of arg_ref::val is meaningful: none (no reference),
+// index (val.index) or name (val.name).
+enum class arg_id_kind { none, index, name };
+
+// An argument reference: either nothing, an argument index or an argument
+// name, discriminated by `kind`.
+template <typename Char> struct arg_ref {
+  // Constructs an empty reference; val is value-initialized (index 0).
+  FMT_CONSTEXPR arg_ref() : kind(arg_id_kind::none), val() {}
+
+  // Constructs a reference to the argument at `index`.
+  FMT_CONSTEXPR explicit arg_ref(int index)
+      : kind(arg_id_kind::index), val(index) {}
+  // Constructs a reference to the argument called `name`.
+  FMT_CONSTEXPR explicit arg_ref(basic_string_view<Char> name)
+      : kind(arg_id_kind::name), val(name) {}
+
+  // Rebinds the reference to the argument at index `idx`.
+  FMT_CONSTEXPR auto operator=(int idx) -> arg_ref& {
+    kind = arg_id_kind::index;
+    val.index = idx;
+    return *this;
+  }
+
+  arg_id_kind kind;
+  // Storage for the referenced id; which member is active is given by `kind`.
+  union value {
+    FMT_CONSTEXPR value(int idx = 0) : index(idx) {}
+    FMT_CONSTEXPR value(basic_string_view<Char> n) : name(n) {}
+
+    int index;
+    basic_string_view<Char> name;
+  } val;
+};
+
+// Format specifiers with width and precision resolved at formatting rather
+// than parsing time to allow reusing the same parsed specifiers with
+// different sets of arguments (precompilation of format strings).
+// `width_ref` and `precision_ref` name the arguments that supply the width
+// and precision when they are given as "{...}" in the format spec.
+template <typename Char = char> struct dynamic_format_specs : format_specs {
+  arg_ref<Char> width_ref;
+  arg_ref<Char> precision_ref;
+};
+
+// Narrows a code unit to `char`. Code units above 0xff cannot be represented
+// and yield '\0'; everything else is converted with a plain cast.
+template <typename Char, FMT_ENABLE_IF(std::is_integral<Char>::value)>
+constexpr auto to_ascii(Char c) -> char {
+  return c > 0xff ? '\0' : static_cast<char>(c);
+}
+
+// Returns the number of code units in a code point or 1 on error.
+// For code unit types wider than one byte the answer is always 1. For
+// one-byte code units the length is looked up from the top five bits of the
+// first byte.
+template <typename Char>
+FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int {
+  if (const_check(sizeof(Char) != 1)) return 1;
+  auto c = static_cast<unsigned char>(*begin);
+  // The constant is a table of 32 two-bit entries indexed by c >> 3, each
+  // holding (length - 1): entries 24-27 (bytes 0xC0-0xDF) are 1, entries
+  // 28-29 (0xE0-0xEF) are 2, entry 30 (0xF0-0xF7) is 3 and all other entries,
+  // including 0x80-0xBF and 0xF8-0xFF, are 0.
+  return static_cast<int>((0x3a55000000000000ull >> (2 * (c >> 3))) & 0x3) + 1;
+}
+
+// Linear search for `value` in [first, last). The position is reported
+// through `out` rather than the return value to work around gcc bug 77539:
+// on success `out` points at the first match, otherwise it equals `last`.
+template <bool IS_CONSTEXPR, typename T, typename Ptr = const T*>
+FMT_CONSTEXPR auto find(Ptr first, Ptr last, T value, Ptr& out) -> bool {
+  out = first;
+  while (out != last) {
+    if (*out == value) return true;
+    ++out;
+  }
+  return false;
+}
+
+// Non-constexpr specialization for char that delegates the search to memchr.
+// On failure `out` is set to nullptr (not `last`) and false is returned.
+template <>
+inline auto find<false, char>(const char* first, const char* last, char value,
+                              const char*& out) -> bool {
+  const auto length = to_unsigned(last - first);
+  const void* hit = memchr(first, value, length);
+  out = static_cast<const char*>(hit);
+  return hit != nullptr;
+}
+
+// Parses the range [begin, end) as an unsigned integer. This function assumes
+// that the range is non-empty and the first character is a digit.
+// On return `begin` points just past the last digit consumed. Returns the
+// parsed value, or `error_value` if the number does not fit in an int.
+template <typename Char>
+FMT_CONSTEXPR auto parse_nonnegative_int(const Char*& begin, const Char* end,
+                                         int error_value) noexcept -> int {
+  FMT_ASSERT(begin != end && '0' <= *begin && *begin <= '9', "");
+  // `value` accumulates in unsigned arithmetic (wrap-around is well defined);
+  // `prev` keeps the value before the last digit was appended so the final
+  // step can be re-checked without overflow below.
+  unsigned value = 0, prev = 0;
+  auto p = begin;
+  do {
+    prev = value;
+    value = value * 10 + unsigned(*p - '0');
+    ++p;
+  } while (p != end && '0' <= *p && *p <= '9');
+  auto num_digits = p - begin;
+  begin = p;
+  // A conservative count of decimal digits that always fit in an int; with
+  // that many digits or fewer the accumulated value is returned as is.
+  int digits10 = static_cast<int>(sizeof(int) * CHAR_BIT * 3 / 10);
+  if (num_digits <= digits10) return static_cast<int>(value);
+  // Check for overflow.
+  // With exactly one more digit the last step is redone in 64-bit arithmetic
+  // and compared against INT_MAX; any longer number is rejected outright.
+  unsigned max = INT_MAX;
+  return num_digits == digits10 + 1 &&
+                 prev * 10ull + unsigned(p[-1] - '0') <= max
+             ? static_cast<int>(value)
+             : error_value;
+}
+
+// Maps an alignment character to its align_t value: '<' is left, '>' is
+// right, '^' is center and anything else is align::none.
+FMT_CONSTEXPR inline auto parse_align(char c) -> align_t {
+  return c == '<'   ? align::left
+         : c == '>' ? align::right
+         : c == '^' ? align::center
+                    : align::none;
+}
+
+// Returns true if `c` may start an argument name: an underscore or an ASCII
+// letter of either case.
+template <typename Char> constexpr auto is_name_start(Char c) -> bool {
+  return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+}
+
+// Parses a non-empty argument id starting at `begin`: either a decimal index
+// or a name. Reports the result through `handler.on_index` / `handler.on_name`
+// and returns a pointer just past the id. Malformed ids are reported with
+// report_error.
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto do_parse_arg_id(const Char* begin, const Char* end,
+                                   Handler&& handler) -> const Char* {
+  Char c = *begin;
+  if (c >= '0' && c <= '9') {
+    int index = 0;
+    // A leading '0' is consumed on its own as index 0; any other digit starts
+    // a full number, with INT_MAX used as the overflow marker.
+    if (c != '0')
+      index = parse_nonnegative_int(begin, end, INT_MAX);
+    else
+      ++begin;
+    // A numeric id must be followed directly by '}' or ':'.
+    if (begin == end || (*begin != '}' && *begin != ':'))
+      report_error("invalid format string");
+    else
+      handler.on_index(index);
+    return begin;
+  }
+  if (!is_name_start(c)) {
+    report_error("invalid format string");
+    return begin;
+  }
+  // A name is a name-start character followed by any number of name-start
+  // characters or digits.
+  auto it = begin;
+  do {
+    ++it;
+  } while (it != end && (is_name_start(*it) || ('0' <= *it && *it <= '9')));
+  handler.on_name({begin, to_unsigned(it - begin)});
+  return it;
+}
+
+// Parses an optional argument id at `begin` (the range must be non-empty).
+// When the field starts with '}' or ':' there is no explicit id, so the
+// handler is told to use automatic indexing and nothing is consumed;
+// otherwise the id is parsed by do_parse_arg_id.
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto parse_arg_id(const Char* begin, const Char* end,
+                                Handler&& handler) -> const Char* {
+  FMT_ASSERT(begin != end, "");
+  const Char first = *begin;
+  if (first == '}' || first == ':') {
+    handler.on_auto();
+    return begin;
+  }
+  return do_parse_arg_id(begin, end, handler);
+}
+
+// Argument-id handler used for dynamic width/precision ("{...}" inside a
+// format spec): records the referenced argument in `ref` and runs the parse
+// context's checks on it.
+template <typename Char> struct dynamic_spec_id_handler {
+  basic_format_parse_context<Char>& ctx;
+  arg_ref<Char>& ref;
+
+  // "{}": takes the next automatic argument id from the parse context.
+  FMT_CONSTEXPR void on_auto() {
+    int id = ctx.next_arg_id();
+    ref = arg_ref<Char>(id);
+    ctx.check_dynamic_spec(id);
+  }
+  // "{N}": explicit argument index.
+  FMT_CONSTEXPR void on_index(int id) {
+    ref = arg_ref<Char>(id);
+    ctx.check_arg_id(id);
+    ctx.check_dynamic_spec(id);
+  }
+  // "{name}": named argument; no check_dynamic_spec call is made here.
+  FMT_CONSTEXPR void on_name(basic_string_view<Char> id) {
+    ref = arg_ref<Char>(id);
+    ctx.check_arg_id(id);
+  }
+};
+
+// Parses [integer | "{" [arg_id] "}"].
+// A literal integer is stored in `value`; an argument reference is stored in
+// `ref`. Returns a pointer past the parsed text, or `begin` unchanged if
+// neither form is present. The range must be non-empty.
+template <typename Char>
+FMT_CONSTEXPR auto parse_dynamic_spec(const Char* begin, const Char* end,
+                                      int& value, arg_ref<Char>& ref,
+                                      basic_format_parse_context<Char>& ctx)
+    -> const Char* {
+  FMT_ASSERT(begin != end, "");
+  if ('0' <= *begin && *begin <= '9') {
+    // -1 is the overflow marker; `value` is only written on success.
+    int val = parse_nonnegative_int(begin, end, -1);
+    if (val != -1)
+      value = val;
+    else
+      report_error("number is too big");
+  } else if (*begin == '{') {
+    ++begin;
+    auto handler = dynamic_spec_id_handler<Char>{ctx, ref};
+    if (begin != end) begin = parse_arg_id(begin, end, handler);
+    // The reference must be closed by '}'; anything else is an error.
+    if (begin != end && *begin == '}') return ++begin;
+    report_error("invalid format string");
+  }
+  return begin;
+}
+
+// Parses a precision. `begin` points at the character that introduces it,
+// which is skipped; what follows must be a literal integer or a "{...}"
+// argument reference. An empty precision (end of input or '}') is reported as
+// an error and the position after the skipped character is returned.
+template <typename Char>
+FMT_CONSTEXPR auto parse_precision(const Char* begin, const Char* end,
+                                   int& value, arg_ref<Char>& ref,
+                                   basic_format_parse_context<Char>& ctx)
+    -> const Char* {
+  ++begin;
+  const bool has_value = begin != end && *begin != '}';
+  if (has_value) return parse_dynamic_spec(begin, end, value, ref, ctx);
+  report_error("invalid precision");
+  return begin;
+}
+
+// Parser states of parse_format_specs, listed in the order in which the
+// corresponding parts of a format spec must appear.
+enum class state { start, align, sign, hash, zero, width, precision, locale };
+
+// Parses standard format specifiers.
+// Reads [begin, end), fills `specs` and returns a pointer to the first
+// character that was not consumed. `arg_type` is the type of the argument the
+// spec applies to and is used to reject specifiers that are invalid for it;
+// errors are reported with report_error.
+template <typename Char>
+FMT_CONSTEXPR auto parse_format_specs(const Char* begin, const Char* end,
+                                      dynamic_format_specs<Char>& specs,
+                                      basic_format_parse_context<Char>& ctx,
+                                      type arg_type) -> const Char* {
+  // `c` is the character the switch below dispatches on. If the second
+  // character is an alignment character, the first one is a fill character,
+  // so `c` is set to '\0' to route it to the default (fill) branch.
+  auto c = '\0';
+  if (end - begin > 1) {
+    auto next = to_ascii(begin[1]);
+    c = parse_align(next) == align::none ? to_ascii(*begin) : '\0';
+  } else {
+    if (begin == end) return begin;
+    c = to_ascii(*begin);
+  }
+
+  // Enforces the order of spec parts: moving to a state that is not strictly
+  // later than the current one, or passing valid == false, is an error.
+  struct {
+    state current_state = state::start;
+    FMT_CONSTEXPR void operator()(state s, bool valid = true) {
+      if (current_state >= s || !valid)
+        report_error("invalid format specifier");
+      current_state = s;
+    }
+  } enter_state;
+
+  using pres = presentation_type;
+  constexpr auto integral_set = sint_set | uint_set | bool_set | char_set;
+  // Handles a presentation type character: checks that arg_type belongs to
+  // `set`, stores the presentation type and returns the position after the
+  // character. For none_type arguments it returns `begin` without consuming.
+  struct {
+    const Char*& begin;
+    dynamic_format_specs<Char>& specs;
+    type arg_type;
+
+    FMT_CONSTEXPR auto operator()(pres pres_type, int set) -> const Char* {
+      if (!in(arg_type, set)) {
+        if (arg_type == type::none_type) return begin;
+        report_error("invalid format specifier");
+      }
+      specs.type = pres_type;
+      return begin + 1;
+    }
+  } parse_presentation_type{begin, specs, arg_type};
+
+  // Each iteration handles one part of the spec. Presentation type characters
+  // and '}' return from the function; everything else loops.
+  for (;;) {
+    switch (c) {
+    case '<':
+    case '>':
+    case '^':
+      // Alignment without an explicit fill character.
+      enter_state(state::align);
+      specs.align = parse_align(c);
+      ++begin;
+      break;
+    case '+':
+    case '-':
+    case ' ':
+      // Sign: only valid for signed integer and floating-point arguments.
+      if (arg_type == type::none_type) return begin;
+      enter_state(state::sign, in(arg_type, sint_set | float_set));
+      switch (c) {
+      case '+':
+        specs.sign = sign::plus;
+        break;
+      case '-':
+        specs.sign = sign::minus;
+        break;
+      case ' ':
+        specs.sign = sign::space;
+        break;
+      }
+      ++begin;
+      break;
+    case '#':
+      // Alternate form: only valid for arithmetic arguments.
+      if (arg_type == type::none_type) return begin;
+      enter_state(state::hash, is_arithmetic_type(arg_type));
+      specs.alt = true;
+      ++begin;
+      break;
+    case '0':
+      // Zero padding: requires an arithmetic argument.
+      enter_state(state::zero);
+      if (!is_arithmetic_type(arg_type)) {
+        if (arg_type == type::none_type) return begin;
+        report_error("format specifier requires numeric argument");
+      }
+      if (specs.align == align::none) {
+        // Ignore 0 if align is specified for compatibility with std::format.
+        specs.align = align::numeric;
+        specs.fill = '0';
+      }
+      ++begin;
+      break;
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9':
+    case '{':
+      // Width: a literal integer or a "{...}" argument reference.
+      enter_state(state::width);
+      begin = parse_dynamic_spec(begin, end, specs.width, specs.width_ref, ctx);
+      break;
+    case '.':
+      // Precision: only valid for floating-point and string arguments.
+      if (arg_type == type::none_type) return begin;
+      enter_state(state::precision,
+                  in(arg_type, float_set | string_set | cstring_set));
+      begin = parse_precision(begin, end, specs.precision, specs.precision_ref,
+                              ctx);
+      break;
+    case 'L':
+      // Locale-specific formatting: only valid for arithmetic arguments.
+      if (arg_type == type::none_type) return begin;
+      enter_state(state::locale, is_arithmetic_type(arg_type));
+      specs.localized = true;
+      ++begin;
+      break;
+    // Presentation types. The uppercase variants set specs.upper and then
+    // fall through to the lowercase handling.
+    case 'd':
+      return parse_presentation_type(pres::dec, integral_set);
+    case 'X':
+      specs.upper = true;
+      FMT_FALLTHROUGH;
+    case 'x':
+      return parse_presentation_type(pres::hex, integral_set);
+    case 'o':
+      return parse_presentation_type(pres::oct, integral_set);
+    case 'B':
+      specs.upper = true;
+      FMT_FALLTHROUGH;
+    case 'b':
+      return parse_presentation_type(pres::bin, integral_set);
+    case 'E':
+      specs.upper = true;
+      FMT_FALLTHROUGH;
+    case 'e':
+      return parse_presentation_type(pres::exp, float_set);
+    case 'F':
+      specs.upper = true;
+      FMT_FALLTHROUGH;
+    case 'f':
+      return parse_presentation_type(pres::fixed, float_set);
+    case 'G':
+      specs.upper = true;
+      FMT_FALLTHROUGH;
+    case 'g':
+      return parse_presentation_type(pres::general, float_set);
+    case 'A':
+      specs.upper = true;
+      FMT_FALLTHROUGH;
+    case 'a':
+      return parse_presentation_type(pres::hexfloat, float_set);
+    case 'c':
+      // bool is part of integral_set but must not be presented as a char.
+      if (arg_type == type::bool_type) report_error("invalid format specifier");
+      return parse_presentation_type(pres::chr, integral_set);
+    case 's':
+      return parse_presentation_type(pres::string,
+                                     bool_set | string_set | cstring_set);
+    case 'p':
+      return parse_presentation_type(pres::pointer, pointer_set | cstring_set);
+    case '?':
+      return parse_presentation_type(pres::debug,
+                                     char_set | string_set | cstring_set);
+    case '}':
+      return begin;
+    default: {
+      // `c` may have been forced to '\0' above, so re-check the actual
+      // character for the end of the replacement field.
+      if (*begin == '}') return begin;
+      // Parse fill and alignment.
+      // The fill is one code point (possibly several code units) and must be
+      // followed by an alignment character.
+      auto fill_end = begin + code_point_length(begin);
+      if (end - fill_end <= 0) {
+        report_error("invalid format specifier");
+        return begin;
+      }
+      if (*begin == '{') {
+        report_error("invalid fill character '{'");
+        return begin;
+      }
+      auto align = parse_align(to_ascii(*fill_end));
+      enter_state(state::align, align != align::none);
+      specs.fill =
+          basic_string_view<Char>(begin, to_unsigned(fill_end - begin));
+      specs.align = align;
+      begin = fill_end + 1;
+    }
+    }
+    // Fetch the next character to dispatch on, or stop at the end of input.
+    if (begin == end) return begin;
+    c = to_ascii(*begin);
+  }
+}
+
+// Parses one replacement field. `begin` points at the opening '{'. Handles
+// "{}" (automatic id), "{{" (escaped brace, emitted as text), "{id}" and
+// "{id:specs}", notifying `handler` of each piece. Returns a pointer just past
+// the field; on error calls handler.on_error and returns `end`.
+template <typename Char, typename Handler>
+FMT_CONSTEXPR auto parse_replacement_field(const Char* begin, const Char* end,
+                                           Handler&& handler) -> const Char* {
+  // Adapts the argument-id callbacks of parse_arg_id to the handler's
+  // on_arg_id overloads and remembers the id the handler returned.
+  struct id_adapter {
+    Handler& handler;
+    int arg_id;
+
+    FMT_CONSTEXPR void on_auto() { arg_id = handler.on_arg_id(); }
+    FMT_CONSTEXPR void on_index(int id) { arg_id = handler.on_arg_id(id); }
+    FMT_CONSTEXPR void on_name(basic_string_view<Char> id) {
+      arg_id = handler.on_arg_id(id);
+    }
+  };
+
+  // Skip the opening '{'.
+  ++begin;
+  if (begin == end) return handler.on_error("invalid format string"), end;
+  if (*begin == '}') {
+    handler.on_replacement_field(handler.on_arg_id(), begin);
+  } else if (*begin == '{') {
+    // "{{" is an escaped brace: emit a single '{' as literal text.
+    handler.on_text(begin, begin + 1);
+  } else {
+    auto adapter = id_adapter{handler, 0};
+    begin = parse_arg_id(begin, end, adapter);
+    // A default-constructed Char stands in for "end of input" here.
+    Char c = begin != end ? *begin : Char();
+    if (c == '}') {
+      handler.on_replacement_field(adapter.arg_id, begin);
+    } else if (c == ':') {
+      // The handler parses the specs and must stop at the closing '}'.
+      begin = handler.on_format_specs(adapter.arg_id, begin + 1, end);
+      if (begin == end || *begin != '}')
+        return handler.on_error("unknown format specifier"), end;
+    } else {
+      return handler.on_error("missing '}' in format string"), end;
+    }
+  }
+  // Step over the closing '}' (or the second '{' of an escape).
+  return begin + 1;
+}
+
+// Parses a whole format string, reporting literal text through
+// handler.on_text, replacement fields through parse_replacement_field and
+// errors through handler.on_error. Escaped braces ("{{" and "}}") are emitted
+// as text containing a single brace.
+template <bool IS_CONSTEXPR, typename Char, typename Handler>
+FMT_CONSTEXPR void parse_format_string(basic_string_view<Char> format_str,
+                                       Handler&& handler) {
+  auto begin = format_str.data();
+  auto end = begin + format_str.size();
+  if (end - begin < 32) {
+    // Use a simple loop instead of memchr for small strings.
+    // `begin` marks the start of the pending literal text, `p` the cursor.
+    const Char* p = begin;
+    while (p != end) {
+      auto c = *p++;
+      if (c == '{') {
+        handler.on_text(begin, p - 1);
+        begin = p = parse_replacement_field(p - 1, end, handler);
+      } else if (c == '}') {
+        if (p == end || *p != '}')
+          return handler.on_error("unmatched '}' in format string");
+        // "}}": emit the text up to and including the first '}' and skip the
+        // second one.
+        handler.on_text(begin, p);
+        begin = ++p;
+      }
+    }
+    handler.on_text(begin, end);
+    return;
+  }
+  // Emits the literal text [from, to), which contains no '{', handling "}}"
+  // escapes and reporting a lone '}' as an error.
+  struct writer {
+    FMT_CONSTEXPR void operator()(const Char* from, const Char* to) {
+      if (from == to) return;
+      for (;;) {
+        const Char* p = nullptr;
+        if (!find<IS_CONSTEXPR>(from, to, Char('}'), p))
+          return handler_.on_text(from, to);
+        ++p;
+        if (p == to || *p != '}')
+          return handler_.on_error("unmatched '}' in format string");
+        handler_.on_text(from, p);
+        from = p + 1;
+      }
+    }
+    Handler& handler_;
+  } write = {handler};
+  while (begin != end) {
+    // Doing two passes with memchr (one for '{' and another for '}') is up to
+    // 2.5x faster than the naive one-pass implementation on big format strings.
+    const Char* p = begin;
+    // If no further '{' exists the rest of the string is plain text.
+    if (*begin != '{' && !find<IS_CONSTEXPR>(begin + 1, end, Char('{'), p))
+      return write(begin, end);
+    write(begin, p);
+    begin = parse_replacement_field(p, end, handler);
+  }
+}
+
+// Maps a named-argument type to the type of the value it wraps; any other
+// type is left unchanged.
+template <typename T, bool = is_named_arg<T>::value> struct strip_named_arg {
+  using type = T;
+};
+// Named arguments: the cv-ref-stripped type of the `value` member.
+template <typename T> struct strip_named_arg<T, true> {
+  using type = remove_cvref_t<decltype(T::value)>;
+};
+
+// Parses the format specs for an argument of type T by default-constructing
+// the matching formatter and calling its `parse` on `ctx`. Returns the
+// iterator that `parse` returns.
+template <typename T, typename ParseContext>
+FMT_VISIBILITY("hidden")  // Suppress an ld warning on macOS (#3769).
+FMT_CONSTEXPR auto parse_format_specs(ParseContext& ctx)
+    -> decltype(ctx.begin()) {
+  using char_type = typename ParseContext::char_type;
+  using context = buffered_context<char_type>;
+  // For non-custom types the formatter is chosen by the type the argument
+  // mapper produces; for custom types it is chosen by T itself with any
+  // named-argument wrapper removed.
+  using mapped_type = conditional_t<
+      mapped_type_constant<T, context>::value != type::custom_type,
+      decltype(arg_mapper<context>().map(std::declval<const T&>())),
+      typename strip_named_arg<T>::type>;
+#if defined(__cpp_if_constexpr)
+  if constexpr (std::is_default_constructible<
+                    formatter<mapped_type, char_type>>::value) {
+    return formatter<mapped_type, char_type>().parse(ctx);
+  } else {
+    // NOTE(review): instantiating this type presumably produces the
+    // "unformattable" compile-time diagnostic - confirm at its definition.
+    type_is_unformattable_for<T, char_type> _;
+    return ctx.begin();
+  }
+#else
+  return formatter<mapped_type, char_type>().parse(ctx);
+#endif
+}
+
+// Validates format specs for a character argument. Returns false when the
+// presentation type is not char-like (anything other than none, chr or
+// debug). For char-like presentations, numeric alignment, a sign or the
+// alternate form are reported as errors, and true is returned.
+FMT_CONSTEXPR inline auto check_char_specs(const format_specs& specs) -> bool {
+  const auto pres = specs.type;
+  const bool char_like = pres == presentation_type::none ||
+                         pres == presentation_type::chr ||
+                         pres == presentation_type::debug;
+  if (!char_like) return false;
+  const bool has_numeric_spec = specs.align == align::numeric ||
+                                specs.sign != sign::none || specs.alt;
+  if (has_numeric_spec) report_error("invalid format specifier for char");
+  return true;
+}
+
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+template <int N, typename T, typename... Args, typename Char>
+constexpr auto get_arg_index_by_name(basic_string_view<Char> name) -> int {
+  if constexpr (is_statically_named_arg<T>()) {
+    if (name == T::name) return N;  // T sits at position N and its static name matches.
+  }
+  if constexpr (sizeof...(Args) > 0)
+    return get_arg_index_by_name<N + 1, Args...>(name);  // Continue with the remaining argument types.
+  (void)name;  // Work around an MSVC bug about "unused" parameter.
+  return -1;  // No statically named argument matched.
+}
+#endif
+
+template <typename... Args, typename Char>
+FMT_CONSTEXPR auto get_arg_index_by_name(basic_string_view<Char> name) -> int {
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+  if constexpr (sizeof...(Args) > 0)
+    return get_arg_index_by_name<0, Args...>(name);  // Start the search at index 0.
+#endif
+  (void)name;  // Keeps `name` referenced on the fallback path.
+  return -1;  // Empty pack, or FMT_USE_NONTYPE_TEMPLATE_ARGS is off.
+}
+
+template <typename Char, typename... Args> class format_string_checker {
+ private:
+  using parse_context_type = compile_parse_context<Char>;
+  static constexpr int num_args = sizeof...(Args);
+
+  // Format specifier parsing function.
+  // In the future basic_format_parse_context will replace compile_parse_context
+  // here and will use is_constant_evaluated and downcasting to access the data
+  // needed for compile-time checks: https://godbolt.org/z/GvWzcTjh1.
+  using parse_func = const Char* (*)(parse_context_type&);
+
+  type types_[num_args > 0 ? static_cast<size_t>(num_args) : 1];  // Sized to at least 1 so the array is never empty.
+  parse_context_type context_;
+  parse_func parse_funcs_[num_args > 0 ? static_cast<size_t>(num_args) : 1];  // One parse_format_specs instantiation per argument.
+
+ public:
+  explicit FMT_CONSTEXPR format_string_checker(basic_string_view<Char> fmt)
+      : types_{mapped_type_constant<Args, buffered_context<Char>>::value...},
+        context_(fmt, num_args, types_),
+        parse_funcs_{&parse_format_specs<Args, parse_context_type>...} {}
+
+  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}  // Literal text is not checked.
+
+  FMT_CONSTEXPR auto on_arg_id() -> int { return context_.next_arg_id(); }  // No explicit id: ask the context for the next one.
+  FMT_CONSTEXPR auto on_arg_id(int id) -> int {
+    return context_.check_arg_id(id), id;  // Let the context check the id, then return it unchanged.
+  }
+  FMT_CONSTEXPR auto on_arg_id(basic_string_view<Char> id) -> int {
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+    auto index = get_arg_index_by_name<Args...>(id);  // -1 when no argument carries this name.
+    if (index < 0) on_error("named argument is not found");
+    return index;
+#else
+    (void)id;
+    on_error("compile-time checks for named arguments require C++20 support");
+    return 0;
+#endif
+  }
+
+  FMT_CONSTEXPR void on_replacement_field(int id, const Char* begin) {
+    on_format_specs(id, begin, begin);  // Call parse() on empty specs.
+  }
+
+  FMT_CONSTEXPR auto on_format_specs(int id, const Char* begin, const Char*)
+      -> const Char* {
+    context_.advance_to(begin);
+    // id >= 0 check is a workaround for gcc 10 bug (#2065).
+    return id >= 0 && id < num_args ? parse_funcs_[id](context_) : begin;  // Out-of-range ids are not parsed; `begin` is returned.
+  }
+
+  FMT_NORETURN FMT_CONSTEXPR void on_error(const char* message) {
+    report_error(message);  // All checker errors funnel through report_error.
+  }
+};
+
+// A base class for compile-time strings.
+struct compile_string {};
+
+template <typename S>
+using is_compile_string = std::is_base_of<compile_string, S>;  // True when S is compile_string or derives from it.
+
+// Reports a compile-time error if S is not a valid format string.
+template <typename..., typename S, FMT_ENABLE_IF(!is_compile_string<S>::value)>
+FMT_ALWAYS_INLINE void check_format_string(const S&) {  // Overload for non-compile strings: no parsing happens here.
+#ifdef FMT_ENFORCE_COMPILE_STRING
+  static_assert(is_compile_string<S>::value,  // Always false in this overload, so any use is rejected.
+                "FMT_ENFORCE_COMPILE_STRING requires all format strings to use "
+                "FMT_STRING.");
+#endif
+}
+template <typename... Args, typename S,
+          FMT_ENABLE_IF(is_compile_string<S>::value)>
+void check_format_string(S format_str) {
+  using char_t = typename S::char_type;
+  FMT_CONSTEXPR auto s = basic_string_view<char_t>(format_str);
+  using checker = format_string_checker<char_t, remove_cvref_t<Args>...>;
+  FMT_CONSTEXPR bool error = (parse_format_string<true>(s, checker(s)), true);  // Always true; the variable exists only to run the parse in its initializer.
+  ignore_unused(error);
+}
+
+// Reports an error when `truncated` is set, to prevent silent data loss.
+inline void report_truncation(bool truncated) {
+  if (truncated) report_error("output is truncated");
+}
+
+// Use vformat_args and avoid type_identity to keep symbols short and work
+// around a GCC <= 4.8 bug.
+template <typename Char = char> struct vformat_args {
+  using type = basic_format_args<buffered_context<Char>>;  // Generic case, keyed on the character type.
+};
+template <> struct vformat_args<char> {
+  using type = format_args;  // char uses format_args directly.
+};
+
+template <typename Char>
+void vformat_to(buffer<Char>& buf, basic_string_view<Char> fmt,
+                typename vformat_args<Char>::type args, locale_ref loc = {});
+
+FMT_API void vprint_mojibake(FILE*, string_view, format_args, bool = false);  // The bool defaults to false; println passes true.
+#ifndef _WIN32
+inline void vprint_mojibake(FILE*, string_view, format_args, bool) {}  // Inline no-op when _WIN32 is not defined.
+#endif
+
+template <typename T, typename Char, type TYPE> struct native_formatter {
+ private:
+  dynamic_format_specs<Char> specs_;  // Written by parse() and set_debug_format().
+
+ public:
+  using nonlocking = void;  // NOTE(review): presumably the marker inspected by is_locking; confirm against its definition.
+
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const Char* {
+    if (ctx.begin() == ctx.end() || *ctx.begin() == '}') return ctx.begin();  // Empty specs: nothing to parse.
+    auto end = parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx, TYPE);
+    if (const_check(TYPE == type::char_type)) check_char_specs(specs_);  // Extra validation applied to char only.
+    return end;
+  }
+
+  template <type U = TYPE,
+            FMT_ENABLE_IF(U == type::string_type || U == type::cstring_type ||
+                          U == type::char_type)>  // Available for string, C string and char types only.
+  FMT_CONSTEXPR void set_debug_format(bool set = true) {
+    specs_.type = set ? presentation_type::debug : presentation_type::none;
+  }
+
+  template <typename FormatContext>
+  FMT_CONSTEXPR auto format(const T& val, FormatContext& ctx) const
+      -> decltype(ctx.out());  // Declaration only; no definition appears in this part of the file.
+};
+}  // namespace detail
+
+FMT_BEGIN_EXPORT
+
+// A formatter specialization for natively supported types.
+template <typename T, typename Char>
+struct formatter<T, Char,
+                 enable_if_t<detail::type_constant<T, Char>::value !=
+                             detail::type::custom_type>>  // Enabled for every non-custom type constant.
+    : detail::native_formatter<T, Char, detail::type_constant<T, Char>::value> {
+};  // Empty body: everything is inherited from native_formatter.
+
+template <typename Char = char> struct runtime_format_string {
+  basic_string_view<Char> str;  // The wrapped format string; basic_format_string copies it without parsing.
+};
+
+/// A compile-time format string.
+template <typename Char, typename... Args> class basic_format_string {
+ private:
+  basic_string_view<Char> str_;
+
+ public:
+  template <
+      typename S,
+      FMT_ENABLE_IF(
+          std::is_convertible<const S&, basic_string_view<Char>>::value ||
+          (detail::is_compile_string<S>::value &&
+           std::is_constructible<basic_string_view<Char>, const S&>::value))>
+  FMT_CONSTEVAL FMT_ALWAYS_INLINE basic_format_string(const S& s) : str_(s) {
+    static_assert(
+        detail::count<
+            (std::is_base_of<detail::view, remove_reference_t<Args>>::value &&
+             std::is_reference<Args>::value)...>() == 0,  // Counts Args that are references to detail::view types.
+        "passing views as lvalues is disallowed");
+#if FMT_USE_CONSTEVAL
+    if constexpr (detail::count_named_args<Args...>() ==
+                  detail::count_statically_named_args<Args...>()) {  // Check only if every named argument is statically named.
+      using checker =
+          detail::format_string_checker<Char, remove_cvref_t<Args>...>;
+      detail::parse_format_string<true>(str_, checker(s));
+    }
+#else
+    detail::check_format_string<Args...>(s);  // Parses only when S derives from compile_string.
+#endif
+  }
+  basic_format_string(runtime_format_string<Char> fmt) : str_(fmt.str) {}  // Runtime strings are stored without parsing.
+
+  FMT_ALWAYS_INLINE operator basic_string_view<Char>() const { return str_; }
+  auto get() const -> basic_string_view<Char> { return str_; }  // Same view as the conversion operator.
+};
+
+#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
+// Workaround broken conversion on older gcc.
+template <typename...> using format_string = string_view;
+inline auto runtime(string_view s) -> string_view { return s; }  // format_string is string_view here, so s passes through.
+#else
+template <typename... Args>
+using format_string = basic_format_string<char, type_identity_t<Args>...>;
+/**
+ * Creates a runtime format string.
+ *
+ * **Example**:
+ *
+ *     // Check format string at runtime instead of compile-time.
+ *     fmt::print(fmt::runtime("{:d}"), "I am not a number");
+ */
+inline auto runtime(string_view s) -> runtime_format_string<> { return {{s}}; }  // Wraps s in runtime_format_string<char>.
+#endif
+
+/// Formats a string and writes the output to `out`.
+template <typename OutputIt,
+          FMT_ENABLE_IF(detail::is_output_iterator<remove_cvref_t<OutputIt>,
+                                                   char>::value)>
+auto vformat_to(OutputIt&& out, string_view fmt, format_args args)
+    -> remove_cvref_t<OutputIt> {
+  auto&& buf = detail::get_buffer<char>(out);  // Buffer obtained from `out`.
+  detail::vformat_to(buf, fmt, args, {});  // `{}` is a default-constructed locale_ref.
+  return detail::get_iterator(buf, out);  // Convert the buffer back into an iterator of the caller's type.
+}
+
+/**
+ * Formats `args` according to specifications in `fmt`, writes the result to
+ * the output iterator `out` and returns the iterator past the end of the output
+ * range. `format_to` does not append a terminating null character.
+ *
+ * **Example**:
+ *
+ *     auto out = std::vector<char>();
+ *     fmt::format_to(std::back_inserter(out), "{}", 42);
+ */
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<remove_cvref_t<OutputIt>,
+                                                   char>::value)>
+FMT_INLINE auto format_to(OutputIt&& out, format_string<T...> fmt, T&&... args)
+    -> remove_cvref_t<OutputIt> {
+  return vformat_to(FMT_FWD(out), fmt, fmt::make_format_args(args...));  // Pack the arguments and delegate to vformat_to.
+}
+
+template <typename OutputIt> struct format_to_n_result {  // Return type of format_to_n and vformat_to_n.
+  /// Iterator past the end of the output range.
+  OutputIt out;
+  /// Total (not truncated) output size.
+  size_t size;
+};
+
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+auto vformat_to_n(OutputIt out, size_t n, string_view fmt, format_args args)
+    -> format_to_n_result<OutputIt> {
+  using traits = detail::fixed_buffer_traits;
+  auto buf = detail::iterator_buffer<OutputIt, char, traits>(out, n);  // Buffer over `out`, constructed with the limit n.
+  detail::vformat_to(buf, fmt, args, {});
+  return {buf.out(), buf.count()};  // {iterator, size} in format_to_n_result member order.
+}
+
+/**
+ * Formats `args` according to specifications in `fmt`, writes up to `n`
+ * characters of the result to the output iterator `out` and returns the total
+ * (not truncated) output size and the iterator past the end of the output
+ * range. `format_to_n` does not append a terminating null character.
+ */
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+FMT_INLINE auto format_to_n(OutputIt out, size_t n, format_string<T...> fmt,
+                            T&&... args) -> format_to_n_result<OutputIt> {
+  return vformat_to_n(out, n, fmt, fmt::make_format_args(args...));  // Pack the arguments and delegate to vformat_to_n.
+}
+
+template <typename OutputIt, typename Sentinel = OutputIt>
+struct format_to_result {
+  /// Iterator pointing to just after the last successful write in the range.
+  OutputIt out;
+  /// Specifies if the output was truncated.
+  bool truncated;
+
+  FMT_CONSTEXPR operator OutputIt&() & {
+    detail::report_truncation(truncated);  // Every conversion reports truncation before yielding the iterator.
+    return out;
+  }
+  FMT_CONSTEXPR operator const OutputIt&() const& {
+    detail::report_truncation(truncated);
+    return out;
+  }
+  FMT_CONSTEXPR operator OutputIt&&() && {
+    detail::report_truncation(truncated);
+    return static_cast<OutputIt&&>(out);  // Rvalue object: hand the iterator out as an rvalue reference.
+  }
+};
+
+template <size_t N>
+auto vformat_to(char (&out)[N], string_view fmt, format_args args)
+    -> format_to_result<char*> {
+  auto result = vformat_to_n(out, N, fmt, args);  // Write at most N chars into the array.
+  return {result.out, result.size > N};  // Truncated when the full output is longer than N.
+}
+
+template <size_t N, typename... T>
+FMT_INLINE auto format_to(char (&out)[N], format_string<T...> fmt, T&&... args)
+    -> format_to_result<char*> {
+  auto result = fmt::format_to_n(out, N, fmt, static_cast<T&&>(args)...);  // Write at most N chars into the array.
+  return {result.out, result.size > N};  // Truncated when the full output is longer than N.
+}
+
+/// Returns the number of chars in the output of `format(fmt, args...)`.
+template <typename... T>
+FMT_NODISCARD FMT_INLINE auto formatted_size(format_string<T...> fmt,
+                                             T&&... args) -> size_t {
+  auto buf = detail::counting_buffer<>();  // NOTE(review): presumably tallies chars without storing them; confirm.
+  detail::vformat_to<char>(buf, fmt, fmt::make_format_args(args...), {});
+  return buf.count();  // The count reported by the buffer after formatting.
+}
+
+FMT_API void vprint(string_view fmt, format_args args);
+FMT_API void vprint(FILE* f, string_view fmt, format_args args);
+FMT_API void vprint_buffered(FILE* f, string_view fmt, format_args args);
+FMT_API void vprintln(FILE* f, string_view fmt, format_args args);
+
+/**
+ * Formats `args` according to specifications in `fmt` and writes the output
+ * to `stdout`.
+ *
+ * **Example**:
+ *
+ *     fmt::print("The answer is {}.", 42);
+ */
+template <typename... T>
+FMT_INLINE void print(format_string<T...> fmt, T&&... args) {
+  const auto& vargs = fmt::make_format_args(args...);
+  if (!detail::use_utf8()) return detail::vprint_mojibake(stdout, fmt, vargs);  // use_utf8() is false: take the mojibake path.
+  return detail::is_locking<T...>() ? vprint_buffered(stdout, fmt, vargs)  // is_locking<T...>() is true: use the buffered variant.
+                                    : vprint(fmt, vargs);
+}
+
+/**
+ * Formats `args` according to specifications in `fmt` and writes the
+ * output to the file `f`.
+ *
+ * **Example**:
+ *
+ *     fmt::print(stderr, "Don't {}!", "panic");
+ */
+template <typename... T>
+FMT_INLINE void print(FILE* f, format_string<T...> fmt, T&&... args) {
+  const auto& vargs = fmt::make_format_args(args...);
+  if (!detail::use_utf8()) return detail::vprint_mojibake(f, fmt, vargs);  // use_utf8() is false: take the mojibake path.
+  return detail::is_locking<T...>() ? vprint_buffered(f, fmt, vargs)  // is_locking<T...>() is true: use the buffered variant.
+                                    : vprint(f, fmt, vargs);
+}
+
+/// Formats `args` according to specifications in `fmt` and writes the output
+/// to the file `f` followed by a newline.
+template <typename... T>
+FMT_INLINE void println(FILE* f, format_string<T...> fmt, T&&... args) {
+  const auto& vargs = fmt::make_format_args(args...);
+  return detail::use_utf8() ? vprintln(f, fmt, vargs)
+                            : detail::vprint_mojibake(f, fmt, vargs, true);  // Same fallback as print, with the bool set to true.
+}
+
+/// Formats `args` according to specifications in `fmt` and writes the output
+/// to `stdout` followed by a newline.
+template <typename... T>
+FMT_INLINE void println(format_string<T...> fmt, T&&... args) {
+  return fmt::println(stdout, fmt, static_cast<T&&>(args)...);  // Delegate to the FILE* overload with stdout.
+}
+
+FMT_END_EXPORT
+FMT_GCC_PRAGMA("GCC pop_options")
+FMT_END_NAMESPACE
+
+#ifdef FMT_HEADER_ONLY
+#  include "format.h"
+#endif
+#endif  // FMT_BASE_H_
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/chrono.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/chrono.h
index 1a8d8d04c2aa..c93123fd3353 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/chrono.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/chrono.h
@@ -8,20 +8,31 @@
 #ifndef FMT_CHRONO_H_
 #define FMT_CHRONO_H_
 
-#include <algorithm>
-#include <chrono>
-#include <cmath>    // std::isfinite
-#include <cstring>  // std::memcpy
-#include <ctime>
-#include <iterator>
-#include <locale>
-#include <ostream>
-#include <type_traits>
+#ifndef FMT_MODULE
+#  include <algorithm>
+#  include <chrono>
+#  include <cmath>    // std::isfinite
+#  include <cstring>  // std::memcpy
+#  include <ctime>
+#  include <iterator>
+#  include <locale>
+#  include <ostream>
+#  include <type_traits>
+#endif
 
 #include "format.h"
 
 FMT_BEGIN_NAMESPACE
 
+// Check if std::chrono::local_t is available.
+#ifndef FMT_USE_LOCAL_TIME  // An existing definition is left untouched.
+#  ifdef __cpp_lib_chrono
+#    define FMT_USE_LOCAL_TIME (__cpp_lib_chrono >= 201907L)
+#  else
+#    define FMT_USE_LOCAL_TIME 0  // No feature-test macro: treated as unavailable.
+#  endif
+#endif
+
 // Check if std::chrono::utc_timestamp is available.
 #ifndef FMT_USE_UTC_TIME
 #  ifdef __cpp_lib_chrono
@@ -63,7 +74,8 @@ template <typename To, typename From,
           FMT_ENABLE_IF(!std::is_same<From, To>::value &&
                         std::numeric_limits<From>::is_signed ==
                             std::numeric_limits<To>::is_signed)>
-FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
+FMT_CONSTEXPR auto lossless_integral_conversion(const From from, int& ec)
+    -> To {
   ec = 0;
   using F = std::numeric_limits<From>;
   using T = std::numeric_limits<To>;
@@ -84,15 +96,14 @@ FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
   return static_cast<To>(from);
 }
 
-/**
- * converts From to To, without loss. If the dynamic value of from
- * can't be converted to To without loss, ec is set.
- */
+/// Converts From to To, without loss. If the dynamic value of from
+/// can't be converted to To without loss, ec is set.
 template <typename To, typename From,
           FMT_ENABLE_IF(!std::is_same<From, To>::value &&
                         std::numeric_limits<From>::is_signed !=
                             std::numeric_limits<To>::is_signed)>
-FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
+FMT_CONSTEXPR auto lossless_integral_conversion(const From from, int& ec)
+    -> To {
   ec = 0;
   using F = std::numeric_limits<From>;
   using T = std::numeric_limits<To>;
@@ -124,7 +135,8 @@ FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
 
 template <typename To, typename From,
           FMT_ENABLE_IF(std::is_same<From, To>::value)>
-FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
+FMT_CONSTEXPR auto lossless_integral_conversion(const From from, int& ec)
+    -> To {
   ec = 0;
   return from;
 }  // function
@@ -145,7 +157,7 @@ FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
 // clang-format on
 template <typename To, typename From,
           FMT_ENABLE_IF(!std::is_same<From, To>::value)>
-FMT_CONSTEXPR To safe_float_conversion(const From from, int& ec) {
+FMT_CONSTEXPR auto safe_float_conversion(const From from, int& ec) -> To {
   ec = 0;
   using T = std::numeric_limits<To>;
   static_assert(std::is_floating_point<From>::value, "From must be floating");
@@ -167,20 +179,18 @@ FMT_CONSTEXPR To safe_float_conversion(const From from, int& ec) {
 
 template <typename To, typename From,
           FMT_ENABLE_IF(std::is_same<From, To>::value)>
-FMT_CONSTEXPR To safe_float_conversion(const From from, int& ec) {
+FMT_CONSTEXPR auto safe_float_conversion(const From from, int& ec) -> To {
   ec = 0;
   static_assert(std::is_floating_point<From>::value, "From must be floating");
   return from;
 }
 
-/**
- * safe duration cast between integral durations
- */
+/// Safe duration cast between integral durations
 template <typename To, typename FromRep, typename FromPeriod,
           FMT_ENABLE_IF(std::is_integral<FromRep>::value),
           FMT_ENABLE_IF(std::is_integral<typename To::rep>::value)>
-To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
-                      int& ec) {
+auto safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
+                        int& ec) -> To {
   using From = std::chrono::duration<FromRep, FromPeriod>;
   ec = 0;
   // the basic idea is that we need to convert from count() in the from type
@@ -212,7 +222,8 @@ To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
     }
     const auto min1 =
         (std::numeric_limits<IntermediateRep>::min)() / Factor::num;
-    if (!std::is_unsigned<IntermediateRep>::value && count < min1) {
+    if (detail::const_check(!std::is_unsigned<IntermediateRep>::value) &&
+        count < min1) {
       ec = 1;
       return {};
     }
@@ -224,14 +235,12 @@ To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
   return ec ? To() : To(tocount);
 }
 
-/**
- * safe duration_cast between floating point durations
- */
+/// Safe duration_cast between floating point durations
 template <typename To, typename FromRep, typename FromPeriod,
           FMT_ENABLE_IF(std::is_floating_point<FromRep>::value),
           FMT_ENABLE_IF(std::is_floating_point<typename To::rep>::value)>
-To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
-                      int& ec) {
+auto safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
+                        int& ec) -> To {
   using From = std::chrono::duration<FromRep, FromPeriod>;
   ec = 0;
   if (std::isnan(from.count())) {
@@ -311,12 +320,45 @@ To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
 
 namespace detail {
 template <typename T = void> struct null {};
-inline null<> localtime_r FMT_NOMACRO(...) { return null<>(); }
-inline null<> localtime_s(...) { return null<>(); }
-inline null<> gmtime_r(...) { return null<>(); }
-inline null<> gmtime_s(...) { return null<>(); }
+inline auto localtime_r FMT_NOMACRO(...) -> null<> { return null<>(); }
+inline auto localtime_s(...) -> null<> { return null<>(); }
+inline auto gmtime_r(...) -> null<> { return null<>(); }
+inline auto gmtime_s(...) -> null<> { return null<>(); }
+
+// It is defined here and not in ostream.h because the latter has expensive
+// includes.
+template <typename Streambuf> class formatbuf : public Streambuf {
+ private:
+  using char_type = typename Streambuf::char_type;
+  using streamsize = decltype(std::declval<Streambuf>().sputn(nullptr, 0));  // The type sputn returns.
+  using int_type = typename Streambuf::int_type;
+  using traits_type = typename Streambuf::traits_type;
+
+  buffer<char_type>& buffer_;  // Destination of every character written to this streambuf.
+
+ public:
+  explicit formatbuf(buffer<char_type>& buf) : buffer_(buf) {}
+
+ protected:
+  // The put area is always empty. This makes the implementation simpler and has
+  // the advantage that the streambuf and the buffer are always in sync and
+  // sputc never writes into uninitialized memory. A disadvantage is that each
+  // call to sputc always results in a (virtual) call to overflow. There is no
+  // disadvantage here for sputn since this always results in a call to xsputn.
+
+  auto overflow(int_type ch) -> int_type override {
+    if (!traits_type::eq_int_type(ch, traits_type::eof()))  // EOF is not appended.
+      buffer_.push_back(static_cast<char_type>(ch));
+    return ch;
+  }
+
+  auto xsputn(const char_type* s, streamsize count) -> streamsize override {
+    buffer_.append(s, s + count);  // Append the whole run in one call.
+    return count;  // Reports that all `count` characters were written.
+  }
+};
 
-inline const std::locale& get_classic_locale() {
+inline auto get_classic_locale() -> const std::locale& {
   static const auto& locale = std::locale::classic();
   return locale;
 }
@@ -326,8 +368,6 @@ template <typename CodeUnit> struct codecvt_result {
   CodeUnit buf[max_size];
   CodeUnit* end;
 };
-template <typename CodeUnit>
-constexpr const size_t codecvt_result<CodeUnit>::max_size;
 
 template <typename CodeUnit>
 void write_codecvt(codecvt_result<CodeUnit>& out, string_view in_buf,
@@ -351,11 +391,12 @@ void write_codecvt(codecvt_result<CodeUnit>& out, string_view in_buf,
 template <typename OutputIt>
 auto write_encoded_tm_str(OutputIt out, string_view in, const std::locale& loc)
     -> OutputIt {
-  if (detail::is_utf8() && loc != get_classic_locale()) {
+  if (detail::use_utf8() && loc != get_classic_locale()) {
     // char16_t and char32_t codecvts are broken in MSVC (linkage errors) and
     // gcc-4.
-#if FMT_MSC_VERSION != 0 || \
-    (defined(__GLIBCXX__) && !defined(_GLIBCXX_USE_DUAL_ABI))
+#if FMT_MSC_VERSION != 0 ||  \
+    (defined(__GLIBCXX__) && \
+     (!defined(_GLIBCXX_USE_DUAL_ABI) || _GLIBCXX_USE_DUAL_ABI == 0))
     // The _GLIBCXX_USE_DUAL_ABI macro is always defined in libstdc++ from gcc-5
     // and newer.
     using code_unit = wchar_t;
@@ -367,39 +408,13 @@ auto write_encoded_tm_str(OutputIt out, string_view in, const std::locale& loc)
     unit_t unit;
     write_codecvt(unit, in, loc);
     // In UTF-8 is used one to four one-byte code units.
-    auto&& buf = basic_memory_buffer<char, unit_t::max_size * 4>();
-    for (code_unit* p = unit.buf; p != unit.end; ++p) {
-      uint32_t c = static_cast<uint32_t>(*p);
-      if (sizeof(code_unit) == 2 && c >= 0xd800 && c <= 0xdfff) {
-        // surrogate pair
-        ++p;
-        if (p == unit.end || (c & 0xfc00) != 0xd800 ||
-            (*p & 0xfc00) != 0xdc00) {
-          FMT_THROW(format_error("failed to format time"));
-        }
-        c = (c << 10) + static_cast<uint32_t>(*p) - 0x35fdc00;
-      }
-      if (c < 0x80) {
-        buf.push_back(static_cast<char>(c));
-      } else if (c < 0x800) {
-        buf.push_back(static_cast<char>(0xc0 | (c >> 6)));
-        buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
-      } else if ((c >= 0x800 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xffff)) {
-        buf.push_back(static_cast<char>(0xe0 | (c >> 12)));
-        buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
-        buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
-      } else if (c >= 0x10000 && c <= 0x10ffff) {
-        buf.push_back(static_cast<char>(0xf0 | (c >> 18)));
-        buf.push_back(static_cast<char>(0x80 | ((c & 0x3ffff) >> 12)));
-        buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
-        buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
-      } else {
-        FMT_THROW(format_error("failed to format time"));
-      }
-    }
-    return copy_str<char>(buf.data(), buf.data() + buf.size(), out);
+    auto u =
+        to_utf8<code_unit, basic_memory_buffer<char, unit_t::max_size * 4>>();
+    if (!u.convert({unit.buf, to_unsigned(unit.end - unit.buf)}))
+      FMT_THROW(format_error("failed to format time"));
+    return copy<char>(u.c_str(), u.c_str() + u.size(), out);
   }
-  return copy_str<char>(in.data(), in.data() + in.size(), out);
+  return copy<char>(in.data(), in.data() + in.size(), out);
 }
 
 template <typename Char, typename OutputIt,
@@ -408,7 +423,7 @@ auto write_tm_str(OutputIt out, string_view sv, const std::locale& loc)
     -> OutputIt {
   codecvt_result<Char> unit;
   write_codecvt(unit, sv, loc);
-  return copy_str<Char>(unit.buf, unit.end, out);
+  return copy<Char>(unit.buf, unit.end, out);
 }
 
 template <typename Char, typename OutputIt,
@@ -424,8 +439,7 @@ inline void do_write(buffer<Char>& buf, const std::tm& time,
   auto&& format_buf = formatbuf<std::basic_streambuf<Char>>(buf);
   auto&& os = std::basic_ostream<Char>(&format_buf);
   os.imbue(loc);
-  using iterator = std::ostreambuf_iterator<Char>;
-  const auto& facet = std::use_facet<std::time_put<Char, iterator>>(loc);
+  const auto& facet = std::use_facet<std::time_put<Char>>(loc);
   auto end = facet.put(os, os, Char(' '), &time, format, modifier);
   if (end.failed()) FMT_THROW(format_error("failed to format time"));
 }
@@ -448,38 +462,83 @@ auto write(OutputIt out, const std::tm& time, const std::locale& loc,
   return write_encoded_tm_str(out, string_view(buf.data(), buf.size()), loc);
 }
 
+template <typename Rep1, typename Rep2>
+struct is_same_arithmetic_type  // True when both types are integral or both are floating-point.
+    : public std::integral_constant<bool,
+                                    (std::is_integral<Rep1>::value &&
+                                     std::is_integral<Rep2>::value) ||
+                                        (std::is_floating_point<Rep1>::value &&
+                                         std::is_floating_point<Rep2>::value)> {
+};
+
+template <
+    typename To, typename FromRep, typename FromPeriod,
+    FMT_ENABLE_IF(is_same_arithmetic_type<FromRep, typename To::rep>::value)>
+auto fmt_duration_cast(std::chrono::duration<FromRep, FromPeriod> from) -> To {
+#if FMT_SAFE_DURATION_CAST
+  // Throwing version of safe_duration_cast is only available for
+  // integer to integer or float to float casts.
+  int ec;  // Assigned by safe_duration_cast; nonzero signals a failed conversion.
+  To to = safe_duration_cast::safe_duration_cast<To>(from, ec);
+  if (ec) FMT_THROW(format_error("cannot format duration"));
+  return to;
+#else
+  // Standard duration cast, may overflow.
+  return std::chrono::duration_cast<To>(from);
+#endif
+}
+
+template <
+    typename To, typename FromRep, typename FromPeriod,
+    FMT_ENABLE_IF(!is_same_arithmetic_type<FromRep, typename To::rep>::value)>
+auto fmt_duration_cast(std::chrono::duration<FromRep, FromPeriod> from) -> To {
+  // Mixed integer <-> float cast is not supported by safe_duration_cast.
+  return std::chrono::duration_cast<To>(from);  // Plain std cast; no error is reported here.
+}
+
+template <typename Duration>
+auto to_time_t(
+    std::chrono::time_point<std::chrono::system_clock, Duration> time_point)
+    -> std::time_t {
+  // Cannot use std::chrono::system_clock::to_time_t since this would first
+  // require a cast to std::chrono::system_clock::time_point, which could
+  // overflow.
+  return fmt_duration_cast<std::chrono::duration<std::time_t>>(
+             time_point.time_since_epoch())
+      .count();  // Tick count of a duration<time_t> with the default period, i.e. seconds.
+}
 }  // namespace detail
 
-FMT_MODULE_EXPORT_BEGIN
+FMT_BEGIN_EXPORT
 
 /**
-  Converts given time since epoch as ``std::time_t`` value into calendar time,
-  expressed in local time. Unlike ``std::localtime``, this function is
-  thread-safe on most platforms.
+ * Converts given time since epoch as `std::time_t` value into calendar time,
+ * expressed in local time. Unlike `std::localtime`, this function is
+ * thread-safe on most platforms.
  */
-inline std::tm localtime(std::time_t time) {
+inline auto localtime(std::time_t time) -> std::tm {
   struct dispatcher {
     std::time_t time_;
     std::tm tm_;
 
     dispatcher(std::time_t t) : time_(t) {}
 
-    bool run() {
+    auto run() -> bool {
       using namespace fmt::detail;
       return handle(localtime_r(&time_, &tm_));
     }
 
-    bool handle(std::tm* tm) { return tm != nullptr; }
+    auto handle(std::tm* tm) -> bool { return tm != nullptr; }
 
-    bool handle(detail::null<>) {
+    auto handle(detail::null<>) -> bool {
       using namespace fmt::detail;
       return fallback(localtime_s(&tm_, &time_));
     }
 
-    bool fallback(int res) { return res == 0; }
+    auto fallback(int res) -> bool { return res == 0; }
 
 #if !FMT_MSC_VERSION
-    bool fallback(detail::null<>) {
+    auto fallback(detail::null<>) -> bool {
       using namespace fmt::detail;
       std::tm* tm = std::localtime(&time_);
       if (tm) tm_ = *tm;
@@ -493,57 +552,62 @@ inline std::tm localtime(std::time_t time) {
   return lt.tm_;
 }
 
-inline std::tm localtime(
-    std::chrono::time_point<std::chrono::system_clock> time_point) {
-  return localtime(std::chrono::system_clock::to_time_t(time_point));
+#if FMT_USE_LOCAL_TIME
+template <typename Duration>
+inline auto localtime(std::chrono::local_time<Duration> time) -> std::tm {
+  return localtime(
+      detail::to_time_t(std::chrono::current_zone()->to_sys(time)));  // Map to system time via the current zone, then reuse the time_t overload.
 }
+#endif
 
 /**
-  Converts given time since epoch as ``std::time_t`` value into calendar time,
-  expressed in Coordinated Universal Time (UTC). Unlike ``std::gmtime``, this
-  function is thread-safe on most platforms.
+ * Converts given time since epoch as `std::time_t` value into calendar time,
+ * expressed in Coordinated Universal Time (UTC). Unlike `std::gmtime`, this
+ * function is thread-safe on most platforms.
  */
-inline std::tm gmtime(std::time_t time) {
+inline auto gmtime(std::time_t time) -> std::tm {
   struct dispatcher {
     std::time_t time_;
     std::tm tm_;
 
     dispatcher(std::time_t t) : time_(t) {}
 
-    bool run() {
+    auto run() -> bool {
       using namespace fmt::detail;
       return handle(gmtime_r(&time_, &tm_));
     }
 
-    bool handle(std::tm* tm) { return tm != nullptr; }
+    auto handle(std::tm* tm) -> bool { return tm != nullptr; }
 
-    bool handle(detail::null<>) {
+    auto handle(detail::null<>) -> bool {
       using namespace fmt::detail;
       return fallback(gmtime_s(&tm_, &time_));
     }
 
-    bool fallback(int res) { return res == 0; }
+    auto fallback(int res) -> bool { return res == 0; }
 
 #if !FMT_MSC_VERSION
-    bool fallback(detail::null<>) {
+    auto fallback(detail::null<>) -> bool {
       std::tm* tm = std::gmtime(&time_);
       if (tm) tm_ = *tm;
       return tm != nullptr;
     }
 #endif
   };
-  dispatcher gt(time);
+  auto gt = dispatcher(time);
   // Too big time values may be unsupported.
   if (!gt.run()) FMT_THROW(format_error("time_t value out of range"));
   return gt.tm_;
 }
 
-inline std::tm gmtime(
-    std::chrono::time_point<std::chrono::system_clock> time_point) {
-  return gmtime(std::chrono::system_clock::to_time_t(time_point));
+template <typename Duration>
+inline auto gmtime(
+    std::chrono::time_point<std::chrono::system_clock, Duration> time_point)
+    -> std::tm {
+  return gmtime(detail::to_time_t(time_point));  // Convert with detail::to_time_t, then reuse the time_t overload.
 }
 
-FMT_BEGIN_DETAIL_NAMESPACE
+namespace detail {
 
 // Writes two-digit numbers a, b and c separated by sep to buf.
 // The method by Pavel Novikov based on
@@ -579,7 +643,8 @@ inline void write_digit2_separated(char* buf, unsigned a, unsigned b,
   }
 }
 
-template <typename Period> FMT_CONSTEXPR inline const char* get_units() {
+template <typename Period>
+FMT_CONSTEXPR inline auto get_units() -> const char* {
   if (std::is_same<Period, std::atto>::value) return "as";
   if (std::is_same<Period, std::femto>::value) return "fs";
   if (std::is_same<Period, std::pico>::value) return "ps";
@@ -597,8 +662,9 @@ template <typename Period> FMT_CONSTEXPR inline const char* get_units() {
   if (std::is_same<Period, std::tera>::value) return "Ts";
   if (std::is_same<Period, std::peta>::value) return "Ps";
   if (std::is_same<Period, std::exa>::value) return "Es";
-  if (std::is_same<Period, std::ratio<60>>::value) return "m";
+  if (std::is_same<Period, std::ratio<60>>::value) return "min";
   if (std::is_same<Period, std::ratio<3600>>::value) return "h";
+  if (std::is_same<Period, std::ratio<86400>>::value) return "d";
   return nullptr;
 }
 
@@ -608,13 +674,37 @@ enum class numeric_system {
   alternative
 };
 
+// Glibc-style padding modes for numeric fields, chosen by a flag after '%'.
+enum class pad_type {
+  // Pad a numeric result string with zeros (the default, no flag).
+  zero,
+  // Do not pad a numeric result string (the '-' flag).
+  none,
+  // Pad a numeric result string with spaces (the '_' flag).
+  space,
+};
+
+template <typename OutputIt>  // writes width fill characters chosen by pad
+auto write_padding(OutputIt out, pad_type pad, int width) -> OutputIt {
+  if (pad == pad_type::none) return out;  // no padding requested
+  return detail::fill_n(out, width, pad == pad_type::space ? ' ' : '0');
+}
+
+template <typename OutputIt>  // single-character variant of write_padding
+auto write_padding(OutputIt out, pad_type pad) -> OutputIt {
+  if (pad != pad_type::none) *out++ = pad == pad_type::space ? ' ' : '0';
+  return out;  // unchanged when pad is none
+}
+
 // Parses a put_time-like format string and invokes handler actions.
 template <typename Char, typename Handler>
-FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
-                                              const Char* end,
-                                              Handler&& handler) {
+FMT_CONSTEXPR auto parse_chrono_format(const Char* begin, const Char* end,
+                                       Handler&& handler) -> const Char* {
+  if (begin == end || *begin == '}') return begin;
+  if (*begin != '%') FMT_THROW(format_error("invalid format"));
   auto ptr = begin;
   while (ptr != end) {
+    pad_type pad = pad_type::zero;
     auto c = *ptr;
     if (c == '}') break;
     if (c != '%') {
@@ -624,6 +714,18 @@ FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
     if (begin != ptr) handler.on_text(begin, ptr);
     ++ptr;  // consume '%'
     if (ptr == end) FMT_THROW(format_error("invalid format"));
+    c = *ptr;
+    switch (c) {
+    case '_':
+      pad = pad_type::space;
+      ++ptr;
+      break;
+    case '-':
+      pad = pad_type::none;
+      ++ptr;
+      break;
+    }
+    if (ptr == end) FMT_THROW(format_error("invalid format"));
     c = *ptr++;
     switch (c) {
     case '%':
@@ -681,35 +783,35 @@ FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
       break;
     // Day of the year/month:
     case 'U':
-      handler.on_dec0_week_of_year(numeric_system::standard);
+      handler.on_dec0_week_of_year(numeric_system::standard, pad);
       break;
     case 'W':
-      handler.on_dec1_week_of_year(numeric_system::standard);
+      handler.on_dec1_week_of_year(numeric_system::standard, pad);
       break;
     case 'V':
-      handler.on_iso_week_of_year(numeric_system::standard);
+      handler.on_iso_week_of_year(numeric_system::standard, pad);
       break;
     case 'j':
       handler.on_day_of_year();
       break;
     case 'd':
-      handler.on_day_of_month(numeric_system::standard);
+      handler.on_day_of_month(numeric_system::standard, pad);
       break;
     case 'e':
-      handler.on_day_of_month_space(numeric_system::standard);
+      handler.on_day_of_month(numeric_system::standard, pad_type::space);
       break;
     // Hour, minute, second:
     case 'H':
-      handler.on_24_hour(numeric_system::standard);
+      handler.on_24_hour(numeric_system::standard, pad);
       break;
     case 'I':
-      handler.on_12_hour(numeric_system::standard);
+      handler.on_12_hour(numeric_system::standard, pad);
       break;
     case 'M':
-      handler.on_minute(numeric_system::standard);
+      handler.on_minute(numeric_system::standard, pad);
       break;
     case 'S':
-      handler.on_second(numeric_system::standard);
+      handler.on_second(numeric_system::standard, pad);
       break;
     // Other:
     case 'c':
@@ -746,7 +848,7 @@ FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
       handler.on_duration_unit();
       break;
     case 'z':
-      handler.on_utc_offset();
+      handler.on_utc_offset(numeric_system::standard);
       break;
     case 'Z':
       handler.on_tz_name();
@@ -774,6 +876,9 @@ FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
       case 'X':
         handler.on_loc_time(numeric_system::alternative);
         break;
+      case 'z':
+        handler.on_utc_offset(numeric_system::alternative);
+        break;
       default:
         FMT_THROW(format_error("invalid format"));
       }
@@ -790,19 +895,19 @@ FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
         handler.on_dec_month(numeric_system::alternative);
         break;
       case 'U':
-        handler.on_dec0_week_of_year(numeric_system::alternative);
+        handler.on_dec0_week_of_year(numeric_system::alternative, pad);
         break;
       case 'W':
-        handler.on_dec1_week_of_year(numeric_system::alternative);
+        handler.on_dec1_week_of_year(numeric_system::alternative, pad);
         break;
       case 'V':
-        handler.on_iso_week_of_year(numeric_system::alternative);
+        handler.on_iso_week_of_year(numeric_system::alternative, pad);
         break;
       case 'd':
-        handler.on_day_of_month(numeric_system::alternative);
+        handler.on_day_of_month(numeric_system::alternative, pad);
         break;
       case 'e':
-        handler.on_day_of_month_space(numeric_system::alternative);
+        handler.on_day_of_month(numeric_system::alternative, pad_type::space);
         break;
       case 'w':
         handler.on_dec0_weekday(numeric_system::alternative);
@@ -811,16 +916,19 @@ FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin,
         handler.on_dec1_weekday(numeric_system::alternative);
         break;
       case 'H':
-        handler.on_24_hour(numeric_system::alternative);
+        handler.on_24_hour(numeric_system::alternative, pad);
         break;
       case 'I':
-        handler.on_12_hour(numeric_system::alternative);
+        handler.on_12_hour(numeric_system::alternative, pad);
         break;
       case 'M':
-        handler.on_minute(numeric_system::alternative);
+        handler.on_minute(numeric_system::alternative, pad);
         break;
       case 'S':
-        handler.on_second(numeric_system::alternative);
+        handler.on_second(numeric_system::alternative, pad);
+        break;
+      case 'z':
+        handler.on_utc_offset(numeric_system::alternative);
         break;
       default:
         FMT_THROW(format_error("invalid format"));
@@ -852,12 +960,19 @@ template <typename Derived> struct null_chrono_spec_handler {
   FMT_CONSTEXPR void on_abbr_month() { unsupported(); }
   FMT_CONSTEXPR void on_full_month() { unsupported(); }
   FMT_CONSTEXPR void on_dec_month(numeric_system) { unsupported(); }
-  FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system) { unsupported(); }
-  FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system) { unsupported(); }
-  FMT_CONSTEXPR void on_iso_week_of_year(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system, pad_type) {
+    unsupported();
+  }
+  FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system, pad_type) {
+    unsupported();
+  }
+  FMT_CONSTEXPR void on_iso_week_of_year(numeric_system, pad_type) {
+    unsupported();
+  }
   FMT_CONSTEXPR void on_day_of_year() { unsupported(); }
-  FMT_CONSTEXPR void on_day_of_month(numeric_system) { unsupported(); }
-  FMT_CONSTEXPR void on_day_of_month_space(numeric_system) { unsupported(); }
+  FMT_CONSTEXPR void on_day_of_month(numeric_system, pad_type) {
+    unsupported();
+  }
   FMT_CONSTEXPR void on_24_hour(numeric_system) { unsupported(); }
   FMT_CONSTEXPR void on_12_hour(numeric_system) { unsupported(); }
   FMT_CONSTEXPR void on_minute(numeric_system) { unsupported(); }
@@ -873,7 +988,7 @@ template <typename Derived> struct null_chrono_spec_handler {
   FMT_CONSTEXPR void on_am_pm() { unsupported(); }
   FMT_CONSTEXPR void on_duration_value() { unsupported(); }
   FMT_CONSTEXPR void on_duration_unit() { unsupported(); }
-  FMT_CONSTEXPR void on_utc_offset() { unsupported(); }
+  FMT_CONSTEXPR void on_utc_offset(numeric_system) { unsupported(); }
   FMT_CONSTEXPR void on_tz_name() { unsupported(); }
 };
 
@@ -895,16 +1010,15 @@ struct tm_format_checker : null_chrono_spec_handler<tm_format_checker> {
   FMT_CONSTEXPR void on_abbr_month() {}
   FMT_CONSTEXPR void on_full_month() {}
   FMT_CONSTEXPR void on_dec_month(numeric_system) {}
-  FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system) {}
-  FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system) {}
-  FMT_CONSTEXPR void on_iso_week_of_year(numeric_system) {}
+  FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_iso_week_of_year(numeric_system, pad_type) {}
   FMT_CONSTEXPR void on_day_of_year() {}
-  FMT_CONSTEXPR void on_day_of_month(numeric_system) {}
-  FMT_CONSTEXPR void on_day_of_month_space(numeric_system) {}
-  FMT_CONSTEXPR void on_24_hour(numeric_system) {}
-  FMT_CONSTEXPR void on_12_hour(numeric_system) {}
-  FMT_CONSTEXPR void on_minute(numeric_system) {}
-  FMT_CONSTEXPR void on_second(numeric_system) {}
+  FMT_CONSTEXPR void on_day_of_month(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_24_hour(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_12_hour(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_minute(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_second(numeric_system, pad_type) {}
   FMT_CONSTEXPR void on_datetime(numeric_system) {}
   FMT_CONSTEXPR void on_loc_date(numeric_system) {}
   FMT_CONSTEXPR void on_loc_time(numeric_system) {}
@@ -914,29 +1028,29 @@ struct tm_format_checker : null_chrono_spec_handler<tm_format_checker> {
   FMT_CONSTEXPR void on_24_hour_time() {}
   FMT_CONSTEXPR void on_iso_time() {}
   FMT_CONSTEXPR void on_am_pm() {}
-  FMT_CONSTEXPR void on_utc_offset() {}
+  FMT_CONSTEXPR void on_utc_offset(numeric_system) {}
   FMT_CONSTEXPR void on_tz_name() {}
 };
 
-inline const char* tm_wday_full_name(int wday) {
+inline auto tm_wday_full_name(int wday) -> const char* {
   static constexpr const char* full_name_list[] = {
       "Sunday",   "Monday", "Tuesday", "Wednesday",
       "Thursday", "Friday", "Saturday"};
   return wday >= 0 && wday <= 6 ? full_name_list[wday] : "?";
 }
-inline const char* tm_wday_short_name(int wday) {
+inline auto tm_wday_short_name(int wday) -> const char* {
   static constexpr const char* short_name_list[] = {"Sun", "Mon", "Tue", "Wed",
                                                     "Thu", "Fri", "Sat"};
   return wday >= 0 && wday <= 6 ? short_name_list[wday] : "???";
 }
 
-inline const char* tm_mon_full_name(int mon) {
+inline auto tm_mon_full_name(int mon) -> const char* {
   static constexpr const char* full_name_list[] = {
       "January", "February", "March",     "April",   "May",      "June",
       "July",    "August",   "September", "October", "November", "December"};
   return mon >= 0 && mon <= 11 ? full_name_list[mon] : "?";
 }
-inline const char* tm_mon_short_name(int mon) {
+inline auto tm_mon_short_name(int mon) -> const char* {
   static constexpr const char* short_name_list[] = {
       "Jan", "Feb", "Mar", "Apr", "May", "Jun",
       "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
@@ -966,13 +1080,135 @@ inline void tzset_once() {
 }
 #endif
 
-template <typename OutputIt, typename Char> class tm_writer {
+// Converts value to Int; raises format_error unless it is in [0, upper].
+template <typename T, typename Int, FMT_ENABLE_IF(std::is_integral<T>::value)>
+inline auto to_nonnegative_int(T value, Int upper) -> Int {
+  if (!std::is_unsigned<Int>::value &&  // check is skipped when Int is unsigned
+      (value < 0 || to_unsigned(value) > to_unsigned(upper))) {
+    FMT_THROW(fmt::format_error("chrono value is out of range"));
+  }
+  return static_cast<Int>(value);
+}
+template <typename T, typename Int, FMT_ENABLE_IF(!std::is_integral<T>::value)>
+inline auto to_nonnegative_int(T value, Int upper) -> Int {  // non-integral T
+  auto int_value = static_cast<Int>(value);
+  if (int_value < 0 || value > static_cast<T>(upper))  // upper is inclusive
+    FMT_THROW(format_error("invalid value"));
+  return int_value;
+}
+
+constexpr auto pow10(std::uint32_t n) -> long long {  // computes 10^n
+  return n == 0 ? 1 : 10 * pow10(n - 1);  // recursive; no overflow check
+}
+
+// Counts the fractional decimal digits of Num/Den per the C++20 spec: the
+// smallest N in [0, 18] for which Num * 10^N is a multiple of Den. If none is
+// found before the recursion stops, the result is 6 (microseconds precision).
+template <long long Num, long long Den, int N = 0,
+          bool Enabled = (N < 19) && (Num <= max_value<long long>() / 10)>
+struct count_fractional_digits {
+  static constexpr int value =
+      Num % Den == 0 ? N : count_fractional_digits<Num * 10, Den, N + 1>::value;
+};
+
+// Base case (N reached 19 or Num * 10 would overflow long long): instantiates
+// no further templates and yields N if Num is a multiple of Den, else 6.
+template <long long Num, long long Den, int N>
+struct count_fractional_digits<Num, Den, N, false> {
+  static constexpr int value = (Num % Den == 0) ? N : 6;
+};
+
+// Writes the fractional-second part of d to out as '.' followed by digits.
+// precision < 0 uses the period's digit count; precision == 0 writes nothing.
+template <typename Char, typename OutputIt, typename Duration>
+void write_fractional_seconds(OutputIt& out, Duration d, int precision = -1) {
+  constexpr auto num_fractional_digits =
+      count_fractional_digits<Duration::period::num,
+                              Duration::period::den>::value;
+
+  using subsecond_precision = std::chrono::duration<
+      typename std::common_type<typename Duration::rep,
+                                std::chrono::seconds::rep>::type,
+      std::ratio<1, detail::pow10(num_fractional_digits)>>;
+
+  const auto fractional = d - fmt_duration_cast<std::chrono::seconds>(d);
+  const auto subseconds =
+      std::chrono::treat_as_floating_point<
+          typename subsecond_precision::rep>::value
+          ? fractional.count()
+          : fmt_duration_cast<subsecond_precision>(fractional).count();
+  auto n = static_cast<uint32_or_64_or_128_t<long long>>(subseconds);
+  const int num_digits = detail::count_digits(n);
+
+  int leading_zeroes = (std::max)(0, num_fractional_digits - num_digits);
+  if (precision < 0) {  // default: all digits implied by the period
+    FMT_ASSERT(!std::is_floating_point<typename Duration::rep>::value, "");
+    if (std::ratio_less<typename subsecond_precision::period,
+                        std::chrono::seconds::period>::value) {
+      *out++ = '.';
+      out = detail::fill_n(out, leading_zeroes, '0');
+      out = format_decimal<Char>(out, n, num_digits).end;
+    }
+  } else if (precision > 0) {  // explicit digit count
+    *out++ = '.';
+    leading_zeroes = (std::min)(leading_zeroes, precision);
+    int remaining = precision - leading_zeroes;
+    out = detail::fill_n(out, leading_zeroes, '0');
+    if (remaining < num_digits) {
+      int num_truncated_digits = num_digits - remaining;
+      n /= to_unsigned(detail::pow10(to_unsigned(num_truncated_digits)));
+      if (n) {
+        out = format_decimal<Char>(out, n, remaining).end;
+      }
+      return;  // truncated output needs no trailing zeros
+    }
+    if (n) {
+      out = format_decimal<Char>(out, n, num_digits).end;
+      remaining -= num_digits;
+    }
+    out = detail::fill_n(out, remaining, '0');  // pad up to precision
+  }
+}
+
+// Formats the seconds (mod 60) of a floating-point duration into buf with the
+// given number of fractional digits; a negative count derives it from the
+// period, using 6 when that is < 6 and the value is not a whole number.
+template <typename Duration>
+void write_floating_seconds(memory_buffer& buf, Duration duration,
+                            int num_fractional_digits = -1) {
+  using rep = typename Duration::rep;
+  FMT_ASSERT(std::is_floating_point<rep>::value, "");
+
+  auto val = duration.count();
+
+  if (num_fractional_digits < 0) {
+    // For `std::round` with fallback to `round`:
+    // On some toolchains `std::round` is not available (e.g. GCC 6).
+    using namespace std;
+    num_fractional_digits =
+        count_fractional_digits<Duration::period::num,
+                                Duration::period::den>::value;
+    if (num_fractional_digits < 6 && static_cast<rep>(round(val)) != val)
+      num_fractional_digits = 6;
+  }
+
+  fmt::format_to(std::back_inserter(buf), FMT_STRING("{:.{}f}"),
+                 std::fmod(val * static_cast<rep>(Duration::period::num) /
+                               static_cast<rep>(Duration::period::den),
+                           static_cast<rep>(60)),
+                 num_fractional_digits);
+}
+
+template <typename OutputIt, typename Char,
+          typename Duration = std::chrono::seconds>
+class tm_writer {
  private:
   static constexpr int days_per_week = 7;
 
   const std::locale& loc_;
   const bool is_classic_;
   OutputIt out_;
+  const Duration* subsecs_;
   const std::tm& tm_;
 
   auto tm_sec() const noexcept -> int {
@@ -1021,8 +1257,7 @@ template <typename OutputIt, typename Char> class tm_writer {
     return static_cast<int>(l);
   }
 
-  // Algorithm:
-  // https://en.wikipedia.org/wiki/ISO_week_date#Calculating_the_week_number_from_a_month_and_day_of_the_month_or_ordinal_date
+  // Algorithm: https://en.wikipedia.org/wiki/ISO_week_date.
   auto iso_year_weeks(long long curr_year) const noexcept -> int {
     const auto prev_year = curr_year - 1;
     const auto curr_p =
@@ -1060,6 +1295,17 @@ template <typename OutputIt, typename Char> class tm_writer {
     *out_++ = *d++;
     *out_++ = *d;
   }
+  void write2(int value, pad_type pad) {  // two-digit field honoring pad
+    unsigned int v = to_unsigned(value) % 100;  // keep the last two digits
+    if (v >= 10) {
+      const char* d = digits2(v);
+      *out_++ = *d++;
+      *out_++ = *d;
+    } else {
+      out_ = detail::write_padding(out_, pad);  // '0', ' ' or nothing
+      *out_++ = static_cast<char>('0' + v);
+    }
+  }
 
   void write_year_extended(long long year) {
     // At least 4 characters.
@@ -1071,7 +1317,8 @@ template <typename OutputIt, typename Char> class tm_writer {
     }
     uint32_or_64_or_128_t<long long> n = to_unsigned(year);
     const int num_digits = count_digits(n);
-    if (width > num_digits) out_ = std::fill_n(out_, width - num_digits, '0');
+    if (width > num_digits)
+      out_ = detail::fill_n(out_, width - num_digits, '0');
     out_ = format_decimal<Char>(out_, n, num_digits).end;
   }
   void write_year(long long year) {
@@ -1083,7 +1330,7 @@ template <typename OutputIt, typename Char> class tm_writer {
     }
   }
 
-  void write_utc_offset(long offset) {
+  void write_utc_offset(long offset, numeric_system ns) {
     if (offset < 0) {
       *out_++ = '-';
       offset = -offset;
@@ -1092,14 +1339,15 @@ template <typename OutputIt, typename Char> class tm_writer {
     }
     offset /= 60;
     write2(static_cast<int>(offset / 60));
+    if (ns != numeric_system::standard) *out_++ = ':';
     write2(static_cast<int>(offset % 60));
   }
   template <typename T, FMT_ENABLE_IF(has_member_data_tm_gmtoff<T>::value)>
-  void format_utc_offset_impl(const T& tm) {
-    write_utc_offset(tm.tm_gmtoff);
+  void format_utc_offset_impl(const T& tm, numeric_system ns) {
+    write_utc_offset(tm.tm_gmtoff, ns);
   }
   template <typename T, FMT_ENABLE_IF(!has_member_data_tm_gmtoff<T>::value)>
-  void format_utc_offset_impl(const T& tm) {
+  void format_utc_offset_impl(const T& tm, numeric_system ns) {
 #if defined(_WIN32) && defined(_UCRT)
 #  if FMT_USE_TZSET
     tzset_once();
@@ -1111,10 +1359,17 @@ template <typename OutputIt, typename Char> class tm_writer {
       _get_dstbias(&dstbias);
       offset += dstbias;
     }
-    write_utc_offset(-offset);
+    write_utc_offset(-offset, ns);
 #else
-    ignore_unused(tm);
-    format_localized('z');
+    if (ns == numeric_system::standard) return format_localized('z');
+
+    // Extract timezone offset from timezone conversion functions.
+    std::tm gtm = tm;
+    std::time_t gt = std::mktime(&gtm);
+    std::tm ltm = gmtime(gt);
+    std::time_t lt = std::mktime(&ltm);
+    long offset = gt - lt;
+    write_utc_offset(offset, ns);
 #endif
   }
 
@@ -1135,16 +1390,18 @@ template <typename OutputIt, typename Char> class tm_writer {
   }
 
  public:
-  tm_writer(const std::locale& loc, OutputIt out, const std::tm& tm)
+  tm_writer(const std::locale& loc, OutputIt out, const std::tm& tm,
+            const Duration* subsecs = nullptr)
       : loc_(loc),
         is_classic_(loc_ == get_classic_locale()),
         out_(out),
+        subsecs_(subsecs),
         tm_(tm) {}
 
-  OutputIt out() const { return out_; }
+  auto out() const -> OutputIt { return out_; }
 
   FMT_CONSTEXPR void on_text(const Char* begin, const Char* end) {
-    out_ = copy_str<Char>(begin, end, out_);
+    out_ = copy<Char>(begin, end, out_);
   }
 
   void on_abbr_weekday() {
@@ -1191,7 +1448,7 @@ template <typename OutputIt, typename Char> class tm_writer {
       *out_++ = ' ';
       on_abbr_month();
       *out_++ = ' ';
-      on_day_of_month_space(numeric_system::standard);
+      on_day_of_month(numeric_system::standard, pad_type::space);
       *out_++ = ' ';
       on_iso_time();
       *out_++ = ' ';
@@ -1217,7 +1474,7 @@ template <typename OutputIt, typename Char> class tm_writer {
     write_digit2_separated(buf, to_unsigned(tm_mon() + 1),
                            to_unsigned(tm_mday()),
                            to_unsigned(split_year_lower(tm_year())), '/');
-    out_ = copy_str<Char>(std::begin(buf), std::end(buf), out_);
+    out_ = copy<Char>(std::begin(buf), std::end(buf), out_);
   }
   void on_iso_date() {
     auto year = tm_year();
@@ -1233,10 +1490,10 @@ template <typename OutputIt, typename Char> class tm_writer {
     write_digit2_separated(buf + 2, static_cast<unsigned>(year % 100),
                            to_unsigned(tm_mon() + 1), to_unsigned(tm_mday()),
                            '-');
-    out_ = copy_str<Char>(std::begin(buf) + offset, std::end(buf), out_);
+    out_ = copy<Char>(std::begin(buf) + offset, std::end(buf), out_);
   }
 
-  void on_utc_offset() { format_utc_offset_impl(tm_); }
+  void on_utc_offset(numeric_system ns) { format_utc_offset_impl(tm_, ns); }
   void on_tz_name() { format_tz_name_impl(tm_); }
 
   void on_year(numeric_system ns) {
@@ -1278,24 +1535,26 @@ template <typename OutputIt, typename Char> class tm_writer {
     format_localized('m', 'O');
   }
 
-  void on_dec0_week_of_year(numeric_system ns) {
+  void on_dec0_week_of_year(numeric_system ns, pad_type pad) {
     if (is_classic_ || ns == numeric_system::standard)
-      return write2((tm_yday() + days_per_week - tm_wday()) / days_per_week);
+      return write2((tm_yday() + days_per_week - tm_wday()) / days_per_week,
+                    pad);
     format_localized('U', 'O');
   }
-  void on_dec1_week_of_year(numeric_system ns) {
+  void on_dec1_week_of_year(numeric_system ns, pad_type pad) {
     if (is_classic_ || ns == numeric_system::standard) {
       auto wday = tm_wday();
       write2((tm_yday() + days_per_week -
               (wday == 0 ? (days_per_week - 1) : (wday - 1))) /
-             days_per_week);
+                 days_per_week,
+             pad);
     } else {
       format_localized('W', 'O');
     }
   }
-  void on_iso_week_of_year(numeric_system ns) {
+  void on_iso_week_of_year(numeric_system ns, pad_type pad) {
     if (is_classic_ || ns == numeric_system::standard)
-      return write2(tm_iso_week_of_year());
+      return write2(tm_iso_week_of_year(), pad);
     format_localized('V', 'O');
   }
 
@@ -1309,37 +1568,47 @@ template <typename OutputIt, typename Char> class tm_writer {
     write1(yday / 100);
     write2(yday % 100);
   }
-  void on_day_of_month(numeric_system ns) {
-    if (is_classic_ || ns == numeric_system::standard) return write2(tm_mday());
+  void on_day_of_month(numeric_system ns, pad_type pad) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(tm_mday(), pad);
     format_localized('d', 'O');
   }
-  void on_day_of_month_space(numeric_system ns) {
-    if (is_classic_ || ns == numeric_system::standard) {
-      auto mday = to_unsigned(tm_mday()) % 100;
-      const char* d2 = digits2(mday);
-      *out_++ = mday < 10 ? ' ' : d2[0];
-      *out_++ = d2[1];
-    } else {
-      format_localized('e', 'O');
-    }
-  }
 
-  void on_24_hour(numeric_system ns) {
-    if (is_classic_ || ns == numeric_system::standard) return write2(tm_hour());
+  void on_24_hour(numeric_system ns, pad_type pad) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(tm_hour(), pad);
     format_localized('H', 'O');
   }
-  void on_12_hour(numeric_system ns) {
+  void on_12_hour(numeric_system ns, pad_type pad) {
     if (is_classic_ || ns == numeric_system::standard)
-      return write2(tm_hour12());
+      return write2(tm_hour12(), pad);
     format_localized('I', 'O');
   }
-  void on_minute(numeric_system ns) {
-    if (is_classic_ || ns == numeric_system::standard) return write2(tm_min());
+  void on_minute(numeric_system ns, pad_type pad) {
+    if (is_classic_ || ns == numeric_system::standard)
+      return write2(tm_min(), pad);
     format_localized('M', 'O');
   }
-  void on_second(numeric_system ns) {
-    if (is_classic_ || ns == numeric_system::standard) return write2(tm_sec());
-    format_localized('S', 'O');
+
+  void on_second(numeric_system ns, pad_type pad) {  // handles %S
+    if (is_classic_ || ns == numeric_system::standard) {
+      write2(tm_sec(), pad);
+      if (subsecs_) {  // null when no sub-second value was supplied
+        if (std::is_floating_point<typename Duration::rep>::value) {
+          auto buf = memory_buffer();
+          write_floating_seconds(buf, *subsecs_);
+          if (buf.size() > 1) {
+            // Remove the leading "0", write something like ".123".
+            out_ = std::copy(buf.begin() + 1, buf.end(), out_);
+          }
+        } else {
+          write_fractional_seconds<Char>(out_, *subsecs_);
+        }
+      }
+    } else {
+      // Currently no formatting of subseconds when a locale is set.
+      format_localized('S', 'O');
+    }
+  }
 
   void on_12_hour_time() {
@@ -1347,7 +1616,7 @@ template <typename OutputIt, typename Char> class tm_writer {
       char buf[8];
       write_digit2_separated(buf, to_unsigned(tm_hour12()),
                              to_unsigned(tm_min()), to_unsigned(tm_sec()), ':');
-      out_ = copy_str<Char>(std::begin(buf), std::end(buf), out_);
+      out_ = copy<Char>(std::begin(buf), std::end(buf), out_);
       *out_++ = ' ';
       on_am_pm();
     } else {
@@ -1360,10 +1629,9 @@ template <typename OutputIt, typename Char> class tm_writer {
     write2(tm_min());
   }
   void on_iso_time() {
-    char buf[8];
-    write_digit2_separated(buf, to_unsigned(tm_hour()), to_unsigned(tm_min()),
-                           to_unsigned(tm_sec()), ':');
-    out_ = copy_str<Char>(std::begin(buf), std::end(buf), out_);
+    on_24_hour_time();
+    *out_++ = ':';
+    on_second(numeric_system::standard, pad_type::zero);
   }
 
   void on_am_pm() {
@@ -1381,49 +1649,41 @@ template <typename OutputIt, typename Char> class tm_writer {
 };
 
 struct chrono_format_checker : null_chrono_spec_handler<chrono_format_checker> {
+  bool has_precision_integral = false;
+
   FMT_NORETURN void unsupported() { FMT_THROW(format_error("no date")); }
 
   template <typename Char>
   FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
-  FMT_CONSTEXPR void on_24_hour(numeric_system) {}
-  FMT_CONSTEXPR void on_12_hour(numeric_system) {}
-  FMT_CONSTEXPR void on_minute(numeric_system) {}
-  FMT_CONSTEXPR void on_second(numeric_system) {}
+  FMT_CONSTEXPR void on_day_of_year() {}
+  FMT_CONSTEXPR void on_24_hour(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_12_hour(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_minute(numeric_system, pad_type) {}
+  FMT_CONSTEXPR void on_second(numeric_system, pad_type) {}
   FMT_CONSTEXPR void on_12_hour_time() {}
   FMT_CONSTEXPR void on_24_hour_time() {}
   FMT_CONSTEXPR void on_iso_time() {}
   FMT_CONSTEXPR void on_am_pm() {}
-  FMT_CONSTEXPR void on_duration_value() {}
+  FMT_CONSTEXPR void on_duration_value() const {
+    if (has_precision_integral) {
+      FMT_THROW(format_error("precision not allowed for this argument type"));
+    }
+  }
   FMT_CONSTEXPR void on_duration_unit() {}
 };
 
-template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
-inline bool isfinite(T) {
+template <typename T,
+          FMT_ENABLE_IF(std::is_integral<T>::value&& has_isfinite<T>::value)>
+inline auto isfinite(T) -> bool {
   return true;
 }
 
-// Converts value to Int and checks that it's in the range [0, upper).
-template <typename T, typename Int, FMT_ENABLE_IF(std::is_integral<T>::value)>
-inline Int to_nonnegative_int(T value, Int upper) {
-  FMT_ASSERT(std::is_unsigned<Int>::value ||
-                 (value >= 0 && to_unsigned(value) <= to_unsigned(upper)),
-             "invalid value");
-  (void)upper;
-  return static_cast<Int>(value);
-}
-template <typename T, typename Int, FMT_ENABLE_IF(!std::is_integral<T>::value)>
-inline Int to_nonnegative_int(T value, Int upper) {
-  if (value < 0 || value > static_cast<T>(upper))
-    FMT_THROW(format_error("invalid value"));
-  return static_cast<Int>(value);
-}
-
 template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
-inline T mod(T x, int y) {
+inline auto mod(T x, int y) -> T {
   return x % static_cast<T>(y);
 }
 template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
-inline T mod(T x, int y) {
+inline auto mod(T x, int y) -> T {
   return std::fmod(x, static_cast<T>(y));
 }
 
@@ -1438,104 +1698,52 @@ template <typename T> struct make_unsigned_or_unchanged<T, true> {
   using type = typename std::make_unsigned<T>::type;
 };
 
-#if FMT_SAFE_DURATION_CAST
-// throwing version of safe_duration_cast
-template <typename To, typename FromRep, typename FromPeriod>
-To fmt_safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from) {
-  int ec;
-  To to = safe_duration_cast::safe_duration_cast<To>(from, ec);
-  if (ec) FMT_THROW(format_error("cannot format duration"));
-  return to;
-}
-#endif
-
 template <typename Rep, typename Period,
           FMT_ENABLE_IF(std::is_integral<Rep>::value)>
-inline std::chrono::duration<Rep, std::milli> get_milliseconds(
-    std::chrono::duration<Rep, Period> d) {
+inline auto get_milliseconds(std::chrono::duration<Rep, Period> d)
+    -> std::chrono::duration<Rep, std::milli> {  // sub-second part of d
   // this may overflow and/or the result may not fit in the
   // target type.
 #if FMT_SAFE_DURATION_CAST
   using CommonSecondsType =
       typename std::common_type<decltype(d), std::chrono::seconds>::type;
-  const auto d_as_common = fmt_safe_duration_cast<CommonSecondsType>(d);
+  const auto d_as_common = fmt_duration_cast<CommonSecondsType>(d);
   const auto d_as_whole_seconds =
-      fmt_safe_duration_cast<std::chrono::seconds>(d_as_common);
+      fmt_duration_cast<std::chrono::seconds>(d_as_common);
   // this conversion should be nonproblematic
   const auto diff = d_as_common - d_as_whole_seconds;
   const auto ms =
-      fmt_safe_duration_cast<std::chrono::duration<Rep, std::milli>>(diff);
+      fmt_duration_cast<std::chrono::duration<Rep, std::milli>>(diff);
   return ms;
 #else
-  auto s = std::chrono::duration_cast<std::chrono::seconds>(d);
-  return std::chrono::duration_cast<std::chrono::milliseconds>(d - s);
+  auto s = fmt_duration_cast<std::chrono::seconds>(d);  // d cast to seconds
+  return fmt_duration_cast<std::chrono::milliseconds>(d - s);  // remainder
 #endif
 }
 
-// Counts the number of fractional digits in the range [0, 18] according to the
-// C++20 spec. If more than 18 fractional digits are required then returns 6 for
-// microseconds precision.
-template <long long Num, long long Den, int N = 0,
-          bool Enabled = (N < 19) && (Num <= max_value<long long>() / 10)>
-struct count_fractional_digits {
-  static constexpr int value =
-      Num % Den == 0 ? N : count_fractional_digits<Num * 10, Den, N + 1>::value;
-};
-
-// Base case that doesn't instantiate any more templates
-// in order to avoid overflow.
-template <long long Num, long long Den, int N>
-struct count_fractional_digits<Num, Den, N, false> {
-  static constexpr int value = (Num % Den == 0) ? N : 6;
-};
-
-constexpr long long pow10(std::uint32_t n) {
-  return n == 0 ? 1 : 10 * pow10(n - 1);
-}
-
-template <class Rep, class Period,
-          FMT_ENABLE_IF(std::numeric_limits<Rep>::is_signed)>
-constexpr std::chrono::duration<Rep, Period> abs(
-    std::chrono::duration<Rep, Period> d) {
-  // We need to compare the duration using the count() method directly
-  // due to a compiler bug in clang-11 regarding the spaceship operator,
-  // when -Wzero-as-null-pointer-constant is enabled.
-  // In clang-12 the bug has been fixed. See
-  // https://bugs.llvm.org/show_bug.cgi?id=46235 and the reproducible example:
-  // https://www.godbolt.org/z/Knbb5joYx.
-  return d.count() >= d.zero().count() ? d : -d;
-}
-
-template <class Rep, class Period,
-          FMT_ENABLE_IF(!std::numeric_limits<Rep>::is_signed)>
-constexpr std::chrono::duration<Rep, Period> abs(
-    std::chrono::duration<Rep, Period> d) {
-  return d;
-}
-
 template <typename Char, typename Rep, typename OutputIt,
           FMT_ENABLE_IF(std::is_integral<Rep>::value)>
-OutputIt format_duration_value(OutputIt out, Rep val, int) {
+auto format_duration_value(OutputIt out, Rep val, int) -> OutputIt {
   return write<Char>(out, val);
 }
 
 template <typename Char, typename Rep, typename OutputIt,
           FMT_ENABLE_IF(std::is_floating_point<Rep>::value)>
-OutputIt format_duration_value(OutputIt out, Rep val, int precision) {
-  auto specs = basic_format_specs<Char>();
+auto format_duration_value(OutputIt out, Rep val, int precision) -> OutputIt {
+  auto specs = format_specs();  // defaults; only precision and type are set
   specs.precision = precision;
-  specs.type = precision >= 0 ? presentation_type::fixed_lower
-                              : presentation_type::general_lower;
+  specs.type =  // fixed for precision >= 0, otherwise general
+      precision >= 0 ? presentation_type::fixed : presentation_type::general;
   return write<Char>(out, val, specs);
 }
 
 template <typename Char, typename OutputIt>
-OutputIt copy_unit(string_view unit, OutputIt out, Char) {
+auto copy_unit(string_view unit, OutputIt out, Char) -> OutputIt {
   return std::copy(unit.begin(), unit.end(), out);
 }
 
 template <typename OutputIt>
-OutputIt copy_unit(string_view unit, OutputIt out, wchar_t) {
+auto copy_unit(string_view unit, OutputIt out, wchar_t) -> OutputIt {
   // This works when wchar_t is UTF-32 because units only contain characters
   // that have the same representation in UTF-16 and UTF-32.
   utf8_to_utf16 u(unit);
@@ -1543,7 +1751,7 @@ OutputIt copy_unit(string_view unit, OutputIt out, wchar_t) {
 }
 
 template <typename Char, typename Period, typename OutputIt>
-OutputIt format_duration_unit(OutputIt out) {
+auto format_duration_unit(OutputIt out) -> OutputIt {
   if (const char* unit = get_units<Period>())
     return copy_unit(string_view(unit), out, Char());
   *out++ = '[';
@@ -1566,8 +1774,10 @@ class get_locale {
 
  public:
   get_locale(bool localized, locale_ref loc) : has_locale_(localized) {
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR  // locale_ is only built in this case
     if (localized)
       ::new (&locale_) std::locale(loc.template get<std::locale>());
+#endif  // NOTE(review): has_locale_ is still set when this is compiled out
   }
   ~get_locale() {
     if (has_locale_) locale_.~locale();
@@ -1610,18 +1820,12 @@ struct chrono_formatter {
 
     // this may overflow and/or the result may not fit in the
     // target type.
-#if FMT_SAFE_DURATION_CAST
     // might need checked conversion (rep!=Rep)
-    auto tmpval = std::chrono::duration<rep, Period>(val);
-    s = fmt_safe_duration_cast<seconds>(tmpval);
-#else
-    s = std::chrono::duration_cast<seconds>(
-        std::chrono::duration<rep, Period>(val));
-#endif
+    s = fmt_duration_cast<seconds>(std::chrono::duration<rep, Period>(val));
   }
 
   // returns true if nan or inf, writes to out.
-  bool handle_nan_inf() {
+  auto handle_nan_inf() -> bool {
     if (isfinite(val)) {
       return false;
     }
@@ -1638,17 +1842,22 @@ struct chrono_formatter {
     return true;
   }
 
-  Rep hour() const { return static_cast<Rep>(mod((s.count() / 3600), 24)); }
+  auto days() const -> Rep { return static_cast<Rep>(s.count() / 86400); }
+  auto hour() const -> Rep {
+    return static_cast<Rep>(mod((s.count() / 3600), 24));
+  }
 
-  Rep hour12() const {
+  auto hour12() const -> Rep {
     Rep hour = static_cast<Rep>(mod((s.count() / 3600), 12));
     return hour <= 0 ? 12 : hour;
   }
 
-  Rep minute() const { return static_cast<Rep>(mod((s.count() / 60), 60)); }
-  Rep second() const { return static_cast<Rep>(mod(s.count(), 60)); }
+  auto minute() const -> Rep {
+    return static_cast<Rep>(mod((s.count() / 60), 60));
+  }
+  auto second() const -> Rep { return static_cast<Rep>(mod(s.count(), 60)); }
 
-  std::tm time() const {
+  auto time() const -> std::tm {
     auto time = std::tm();
     time.tm_hour = to_nonnegative_int(hour(), 24);
     time.tm_min = to_nonnegative_int(minute(), 60);
@@ -1663,44 +1872,16 @@ struct chrono_formatter {
     }
   }
 
-  void write(Rep value, int width) {
+  void write(Rep value, int width, pad_type pad = pad_type::zero) {
     write_sign();
     if (isnan(value)) return write_nan();
     uint32_or_64_or_128_t<int> n =
         to_unsigned(to_nonnegative_int(value, max_value<int>()));
     int num_digits = detail::count_digits(n);
-    if (width > num_digits) out = std::fill_n(out, width - num_digits, '0');
-    out = format_decimal<char_type>(out, n, num_digits).end;
-  }
-
-  template <typename Duration> void write_fractional_seconds(Duration d) {
-    FMT_ASSERT(!std::is_floating_point<typename Duration::rep>::value, "");
-    constexpr auto num_fractional_digits =
-        count_fractional_digits<Duration::period::num,
-                                Duration::period::den>::value;
-
-    using subsecond_precision = std::chrono::duration<
-        typename std::common_type<typename Duration::rep,
-                                  std::chrono::seconds::rep>::type,
-        std::ratio<1, detail::pow10(num_fractional_digits)>>;
-    if (std::ratio_less<typename subsecond_precision::period,
-                        std::chrono::seconds::period>::value) {
-      *out++ = '.';
-      auto fractional =
-          detail::abs(d) - std::chrono::duration_cast<std::chrono::seconds>(d);
-      auto subseconds =
-          std::chrono::treat_as_floating_point<
-              typename subsecond_precision::rep>::value
-              ? fractional.count()
-              : std::chrono::duration_cast<subsecond_precision>(fractional)
-                    .count();
-      uint32_or_64_or_128_t<long long> n =
-          to_unsigned(to_nonnegative_int(subseconds, max_value<long long>()));
-      int num_digits = detail::count_digits(n);
-      if (num_fractional_digits > num_digits)
-        out = std::fill_n(out, num_fractional_digits - num_digits, '0');
-      out = format_decimal<char_type>(out, n, num_digits).end;
+    if (width > num_digits) {  // value is narrower than the requested width
+      out = detail::write_padding(out, pad, width - num_digits);
     }
+    out = format_decimal<char_type>(out, n, num_digits).end;  // the digits
   }
 
   void write_nan() { std::copy_n("nan", 3, out); }
@@ -1732,7 +1913,7 @@ struct chrono_formatter {
   void on_loc_time(numeric_system) {}
   void on_us_date() {}
   void on_iso_date() {}
-  void on_utc_offset() {}
+  void on_utc_offset(numeric_system) {}
   void on_tz_name() {}
   void on_year(numeric_system) {}
   void on_short_year(numeric_system) {}
@@ -1741,65 +1922,66 @@ struct chrono_formatter {
   void on_iso_week_based_year() {}
   void on_iso_week_based_short_year() {}
   void on_dec_month(numeric_system) {}
-  void on_dec0_week_of_year(numeric_system) {}
-  void on_dec1_week_of_year(numeric_system) {}
-  void on_iso_week_of_year(numeric_system) {}
-  void on_day_of_year() {}
-  void on_day_of_month(numeric_system) {}
-  void on_day_of_month_space(numeric_system) {}
-
-  void on_24_hour(numeric_system ns) {
+  void on_dec0_week_of_year(numeric_system, pad_type) {}
+  void on_dec1_week_of_year(numeric_system, pad_type) {}
+  void on_iso_week_of_year(numeric_system, pad_type) {}
+  void on_day_of_month(numeric_system, pad_type) {}
+
+  void on_day_of_year() {  // for durations: writes the count of whole days
     if (handle_nan_inf()) return;
+    write(days(), 0);  // width 0: write() never pads
+  }
 
-    if (ns == numeric_system::standard) return write(hour(), 2);
+  void on_24_hour(numeric_system ns, pad_type pad) {
+    if (handle_nan_inf()) return;
+
+    if (ns == numeric_system::standard) return write(hour(), 2, pad);
     auto time = tm();
     time.tm_hour = to_nonnegative_int(hour(), 24);
-    format_tm(time, &tm_writer_type::on_24_hour, ns);
+    format_tm(time, &tm_writer_type::on_24_hour, ns, pad);
   }
 
-  void on_12_hour(numeric_system ns) {
+  void on_12_hour(numeric_system ns, pad_type pad) {
     if (handle_nan_inf()) return;
 
-    if (ns == numeric_system::standard) return write(hour12(), 2);
+    if (ns == numeric_system::standard) return write(hour12(), 2, pad);
     auto time = tm();
     time.tm_hour = to_nonnegative_int(hour12(), 12);
-    format_tm(time, &tm_writer_type::on_12_hour, ns);
+    format_tm(time, &tm_writer_type::on_12_hour, ns, pad);
   }
 
-  void on_minute(numeric_system ns) {
+  void on_minute(numeric_system ns, pad_type pad) {
     if (handle_nan_inf()) return;
 
-    if (ns == numeric_system::standard) return write(minute(), 2);
+    if (ns == numeric_system::standard) return write(minute(), 2, pad);
     auto time = tm();
     time.tm_min = to_nonnegative_int(minute(), 60);
-    format_tm(time, &tm_writer_type::on_minute, ns);
+    format_tm(time, &tm_writer_type::on_minute, ns, pad);
   }
 
-  void on_second(numeric_system ns) {
+  void on_second(numeric_system ns, pad_type pad) {
     if (handle_nan_inf()) return;
 
     if (ns == numeric_system::standard) {
       if (std::is_floating_point<rep>::value) {
-        constexpr auto num_fractional_digits =
-            count_fractional_digits<Period::num, Period::den>::value;
         auto buf = memory_buffer();
-        format_to(std::back_inserter(buf), runtime("{:.{}f}"),
-                  std::fmod(val * static_cast<rep>(Period::num) /
-                                static_cast<rep>(Period::den),
-                            static_cast<rep>(60)),
-                  num_fractional_digits);
+        write_floating_seconds(buf, std::chrono::duration<rep, Period>(val),
+                               precision);
         if (negative) *out++ = '-';
-        if (buf.size() < 2 || buf[1] == '.') *out++ = '0';
+        if (buf.size() < 2 || buf[1] == '.') {  // one integer digit at most
+          out = detail::write_padding(out, pad);  // presumably one pad char
+        }
         out = std::copy(buf.begin(), buf.end(), out);
       } else {
-        write(second(), 2);
-        write_fractional_seconds(std::chrono::duration<rep, Period>(val));
+        write(second(), 2, pad);  // integral seconds, padded to width 2
+        write_fractional_seconds<char_type>(
+            out, std::chrono::duration<rep, Period>(val), precision);
       }
       return;
     }
     auto time = tm();
     time.tm_sec = to_nonnegative_int(second(), 60);
-    format_tm(time, &tm_writer_type::on_second, ns);
+    format_tm(time, &tm_writer_type::on_second, ns, pad);  // non-standard ns
   }
 
   void on_12_hour_time() {
@@ -1823,7 +2005,7 @@ struct chrono_formatter {
     on_24_hour_time();
     *out++ = ':';
     if (handle_nan_inf()) return;
-    on_second(numeric_system::standard);
+    on_second(numeric_system::standard, pad_type::zero);
   }
 
   void on_am_pm() {
@@ -1842,168 +2024,279 @@ struct chrono_formatter {
   }
 };
 
-FMT_END_DETAIL_NAMESPACE
+}  // namespace detail
 
 #if defined(__cpp_lib_chrono) && __cpp_lib_chrono >= 201907
 using weekday = std::chrono::weekday;
+using day = std::chrono::day;
+using month = std::chrono::month;
+using year = std::chrono::year;
+using year_month_day = std::chrono::year_month_day;
 #else
 // A fallback version of weekday.
 class weekday {
  private:
-  unsigned char value;
+  unsigned char value_;  // weekday number; the constructor stores 7 as 0
 
  public:
   weekday() = default;
-  explicit constexpr weekday(unsigned wd) noexcept
-      : value(static_cast<unsigned char>(wd != 7 ? wd : 0)) {}
-  constexpr unsigned c_encoding() const noexcept { return value; }
+  constexpr explicit weekday(unsigned wd) noexcept  // no other range check
+      : value_(static_cast<unsigned char>(wd != 7 ? wd : 0)) {}
+  constexpr auto c_encoding() const noexcept -> unsigned { return value_; }
 };
 
-class year_month_day {};
+class day {  // fallback used when std::chrono::day is unavailable
+ private:
+  unsigned char value_;  // constructor argument narrowed to unsigned char
+
+ public:
+  day() = default;  // no member initializer for value_
+  constexpr explicit day(unsigned d) noexcept  // d is not range-checked
+      : value_(static_cast<unsigned char>(d)) {}
+  constexpr explicit operator unsigned() const noexcept { return value_; }
+};
+
+class month {  // fallback used when std::chrono::month is unavailable
+ private:
+  unsigned char value_;  // constructor argument narrowed to unsigned char
+
+ public:
+  month() = default;  // no member initializer for value_
+  constexpr explicit month(unsigned m) noexcept  // m is not range-checked
+      : value_(static_cast<unsigned char>(m)) {}
+  constexpr explicit operator unsigned() const noexcept { return value_; }
+};
+
+class year {  // fallback used when std::chrono::year is unavailable
+ private:
+  int value_;  // year number exactly as passed to the constructor
+
+ public:
+  year() = default;  // no member initializer for value_
+  constexpr explicit year(int y) noexcept : value_(y) {}
+  constexpr explicit operator int() const noexcept { return value_; }
+};
+
+class year_month_day {  // fallback: plain holder of year, month and day
+ private:
+  fmt::year year_;
+  fmt::month month_;
+  fmt::day day_;
+
+ public:
+  year_month_day() = default;  // members are left default-initialized
+  constexpr year_month_day(const year& y, const month& m, const day& d) noexcept
+      : year_(y), month_(m), day_(d) {}  // copies only; no date validation
+  constexpr auto year() const noexcept -> fmt::year { return year_; }
+  constexpr auto month() const noexcept -> fmt::month { return month_; }
+  constexpr auto day() const noexcept -> fmt::day { return day_; }
+};
 #endif
 
-// A rudimentary weekday formatter.
-template <typename Char> struct formatter<weekday, Char> {
+template <typename Char>  // Formats weekday; other specs go via std::tm.
+struct formatter<weekday, Char> : private formatter<std::tm, Char> {
  private:
-  bool localized = false;
+  bool localized_ = false;  // set when the spec starts with 'L'
+  bool use_tm_formatter_ = false;  // set for any other non-empty spec
 
  public:
   FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
       -> decltype(ctx.begin()) {
-    auto begin = ctx.begin(), end = ctx.end();
-    if (begin != end && *begin == 'L') {
-      ++begin;
-      localized = true;
+    auto it = ctx.begin(), end = ctx.end();
+    if (it != end && *it == 'L') {
+      ++it;
+      localized_ = true;
+      return it;  // nothing after 'L' is consumed here
     }
-    return begin;
+    use_tm_formatter_ = it != end && *it != '}';  // non-empty spec?
+    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
   }
 
   template <typename FormatContext>
   auto format(weekday wd, FormatContext& ctx) const -> decltype(ctx.out()) {
     auto time = std::tm();
     time.tm_wday = static_cast<int>(wd.c_encoding());
-    detail::get_locale loc(localized, ctx.locale());
+    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
+    detail::get_locale loc(localized_, ctx.locale());  // default: abbr. name
     auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
     w.on_abbr_weekday();
     return w.out();
   }
 };
 
-template <typename Rep, typename Period, typename Char>
-struct formatter<std::chrono::duration<Rep, Period>, Char> {
+template <typename Char>  // Formats fmt::day; non-empty specs go via std::tm.
+struct formatter<day, Char> : private formatter<std::tm, Char> {
  private:
-  basic_format_specs<Char> specs;
-  int precision = -1;
-  using arg_ref_type = detail::arg_ref<Char>;
-  arg_ref_type width_ref;
-  arg_ref_type precision_ref;
-  bool localized = false;
-  basic_string_view<Char> format_str;
-  using duration = std::chrono::duration<Rep, Period>;
+  bool use_tm_formatter_ = false;  // true when the spec is non-empty
 
-  struct spec_handler {
-    formatter& f;
-    basic_format_parse_context<Char>& context;
-    basic_string_view<Char> format_str;
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto it = ctx.begin(), end = ctx.end();
+    use_tm_formatter_ = it != end && *it != '}';  // non-empty spec?
+    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
+  }
 
-    template <typename Id> FMT_CONSTEXPR arg_ref_type make_arg_ref(Id arg_id) {
-      context.check_arg_id(arg_id);
-      return arg_ref_type(arg_id);
-    }
+  template <typename FormatContext>
+  auto format(day d, FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto time = std::tm();  // value-initialized; only tm_mday is filled in
+    time.tm_mday = static_cast<int>(static_cast<unsigned>(d));
+    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
+    detail::get_locale loc(false, ctx.locale());  // default path: no locale
+    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
+    w.on_day_of_month(detail::numeric_system::standard, detail::pad_type::zero);
+    return w.out();
+  }
+};
 
-    FMT_CONSTEXPR arg_ref_type make_arg_ref(basic_string_view<Char> arg_id) {
-      context.check_arg_id(arg_id);
-      return arg_ref_type(arg_id);
-    }
+template <typename Char>  // Formats fmt::month; other specs go via std::tm.
+struct formatter<month, Char> : private formatter<std::tm, Char> {
+ private:
+  bool localized_ = false;  // set when the spec starts with 'L'
+  bool use_tm_formatter_ = false;  // set for any other non-empty spec
 
-    FMT_CONSTEXPR arg_ref_type make_arg_ref(detail::auto_id) {
-      return arg_ref_type(context.next_arg_id());
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto it = ctx.begin(), end = ctx.end();
+    if (it != end && *it == 'L') {
+      ++it;
+      localized_ = true;
+      return it;  // nothing after 'L' is consumed here
     }
+    use_tm_formatter_ = it != end && *it != '}';  // non-empty spec?
+    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
+  }
 
-    void on_error(const char* msg) { FMT_THROW(format_error(msg)); }
-    FMT_CONSTEXPR void on_fill(basic_string_view<Char> fill) {
-      f.specs.fill = fill;
-    }
-    FMT_CONSTEXPR void on_align(align_t align) { f.specs.align = align; }
-    FMT_CONSTEXPR void on_width(int width) { f.specs.width = width; }
-    FMT_CONSTEXPR void on_precision(int _precision) {
-      f.precision = _precision;
-    }
-    FMT_CONSTEXPR void end_precision() {}
+  template <typename FormatContext>
+  auto format(month m, FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto time = std::tm();  // value-initialized; only tm_mon is filled in
+    time.tm_mon = static_cast<int>(static_cast<unsigned>(m)) - 1;  // 0-based
+    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
+    detail::get_locale loc(localized_, ctx.locale());
+    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
+    w.on_abbr_month();  // default output: abbreviated month name
+    return w.out();
+  }
+};
 
-    template <typename Id> FMT_CONSTEXPR void on_dynamic_width(Id arg_id) {
-      f.width_ref = make_arg_ref(arg_id);
-    }
+template <typename Char>  // Formats fmt::year; non-empty specs go via std::tm.
+struct formatter<year, Char> : private formatter<std::tm, Char> {
+ private:
+  bool use_tm_formatter_ = false;  // true when the spec is non-empty
 
-    template <typename Id> FMT_CONSTEXPR void on_dynamic_precision(Id arg_id) {
-      f.precision_ref = make_arg_ref(arg_id);
-    }
-  };
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto it = ctx.begin(), end = ctx.end();
+    use_tm_formatter_ = it != end && *it != '}';  // non-empty spec?
+    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
+  }
 
-  using iterator = typename basic_format_parse_context<Char>::iterator;
-  struct parse_range {
-    iterator begin;
-    iterator end;
-  };
+  template <typename FormatContext>
+  auto format(year y, FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto time = std::tm();  // value-initialized; only tm_year is filled in
+    time.tm_year = static_cast<int>(y) - 1900;  // tm_year counts from 1900
+    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
+    detail::get_locale loc(false, ctx.locale());  // default path: no locale
+    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
+    w.on_year(detail::numeric_system::standard);
+    return w.out();
+  }
+};
 
-  FMT_CONSTEXPR parse_range do_parse(basic_format_parse_context<Char>& ctx) {
-    auto begin = ctx.begin(), end = ctx.end();
-    if (begin == end || *begin == '}') return {begin, begin};
-    spec_handler handler{*this, ctx, format_str};
-    begin = detail::parse_align(begin, end, handler);
-    if (begin == end) return {begin, begin};
-    begin = detail::parse_width(begin, end, handler);
-    if (begin == end) return {begin, begin};
-    if (*begin == '.') {
-      if (std::is_floating_point<Rep>::value)
-        begin = detail::parse_precision(begin, end, handler);
-      else
-        handler.on_error("precision not allowed for this argument type");
-    }
-    if (begin != end && *begin == 'L') {
-      ++begin;
-      localized = true;
-    }
-    end = detail::parse_chrono_format(begin, end,
-                                      detail::chrono_format_checker());
-    return {begin, end};
+template <typename Char>  // Formats year_month_day; ISO date by default.
+struct formatter<year_month_day, Char> : private formatter<std::tm, Char> {
+ private:
+  bool use_tm_formatter_ = false;  // true when the spec is non-empty
+
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto it = ctx.begin(), end = ctx.end();
+    use_tm_formatter_ = it != end && *it != '}';  // non-empty spec?
+    return use_tm_formatter_ ? formatter<std::tm, Char>::parse(ctx) : it;
   }
 
+  template <typename FormatContext>
+  auto format(year_month_day val, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    auto time = std::tm();  // value-initialized; date fields set below
+    time.tm_year = static_cast<int>(val.year()) - 1900;
+    time.tm_mon = static_cast<int>(static_cast<unsigned>(val.month())) - 1;
+    time.tm_mday = static_cast<int>(static_cast<unsigned>(val.day()));
+    if (use_tm_formatter_) return formatter<std::tm, Char>::format(time, ctx);
+    detail::get_locale loc(true, ctx.locale());  // NOTE(review): always true
+    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), time);
+    w.on_iso_date();
+    return w.out();
+  }
+};
+
+template <typename Rep, typename Period, typename Char>
+struct formatter<std::chrono::duration<Rep, Period>, Char> {
+ private:
+  format_specs specs_;  // align, width and precision parsed from the spec
+  detail::arg_ref<Char> width_ref_;  // dynamic width argument, if any
+  detail::arg_ref<Char> precision_ref_;  // dynamic precision argument, if any
+  bool localized_ = false;  // set by the 'L' flag
+  basic_string_view<Char> format_str_;  // chrono part of the spec (after 'L')
+
  public:
   FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
       -> decltype(ctx.begin()) {
-    auto range = do_parse(ctx);
-    format_str = basic_string_view<Char>(
-        &*range.begin, detail::to_unsigned(range.end - range.begin));
-    return range.end;
+    auto it = ctx.begin(), end = ctx.end();
+    if (it == end || *it == '}') return it;  // empty spec: keep defaults
+
+    it = detail::parse_align(it, end, specs_);
+    if (it == end) return it;
+
+    it = detail::parse_dynamic_spec(it, end, specs_.width, width_ref_, ctx);
+    if (it == end) return it;
+
+    auto checker = detail::chrono_format_checker();
+    if (*it == '.') {  // a precision follows
+      checker.has_precision_integral = !std::is_floating_point<Rep>::value;
+      it = detail::parse_precision(it, end, specs_.precision, precision_ref_,
+                                   ctx);
+    }
+    if (it != end && *it == 'L') {
+      localized_ = true;
+      ++it;
+    }
+    end = detail::parse_chrono_format(it, end, checker);
+    format_str_ = {it, detail::to_unsigned(end - it)};  // checked just above
+    return end;
   }
 
   template <typename FormatContext>
-  auto format(const duration& d, FormatContext& ctx) const
+  auto format(std::chrono::duration<Rep, Period> d, FormatContext& ctx) const
       -> decltype(ctx.out()) {
-    auto specs_copy = specs;
-    auto precision_copy = precision;
-    auto begin = format_str.begin(), end = format_str.end();
+    auto specs = specs_;  // local copy: format() is const
+    auto precision = specs.precision;  // used for the duration value itself
+    specs.precision = -1;  // keep precision out of the final write() call
+    auto begin = format_str_.begin(), end = format_str_.end();
     // As a possible future optimization, we could avoid extra copying if width
     // is not specified.
-    basic_memory_buffer<Char> buf;
+    auto buf = basic_memory_buffer<Char>();  // formatted here, written later
     auto out = std::back_inserter(buf);
-    detail::handle_dynamic_spec<detail::width_checker>(specs_copy.width,
-                                                       width_ref, ctx);
-    detail::handle_dynamic_spec<detail::precision_checker>(precision_copy,
-                                                           precision_ref, ctx);
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width, width_ref_,
+                                                       ctx);
+    detail::handle_dynamic_spec<detail::precision_checker>(precision,
+                                                           precision_ref_, ctx);
     if (begin == end || *begin == '}') {
-      out = detail::format_duration_value<Char>(out, d.count(), precision_copy);
+      out = detail::format_duration_value<Char>(out, d.count(), precision);
       detail::format_duration_unit<Char, Period>(out);
     } else {
-      detail::chrono_formatter<FormatContext, decltype(out), Rep, Period> f(
-          ctx, out, d);
-      f.precision = precision_copy;
-      f.localized = localized;
+      using chrono_formatter =
+          detail::chrono_formatter<FormatContext, decltype(out), Rep, Period>;
+      auto f = chrono_formatter(ctx, out, d);
+      f.precision = precision;
+      f.localized = localized_;
       detail::parse_chrono_format(begin, end, f);
     }
     return detail::write(
-        ctx.out(), basic_string_view<Char>(buf.data(), buf.size()), specs_copy);
+        ctx.out(), basic_string_view<Char>(buf.data(), buf.size()), specs);
   }
 };
 
@@ -2011,87 +2304,129 @@ template <typename Char, typename Duration>
 struct formatter<std::chrono::time_point<std::chrono::system_clock, Duration>,
                  Char> : formatter<std::tm, Char> {
   FMT_CONSTEXPR formatter() {
-    basic_string_view<Char> default_specs =
-        detail::string_literal<Char, '%', 'F', ' ', '%', 'T'>{};
-    this->do_parse(default_specs.begin(), default_specs.end());
+    this->format_str_ = detail::string_literal<Char, '%', 'F', ' ', '%', 'T'>{};
   }
 
   template <typename FormatContext>
-  auto format(std::chrono::time_point<std::chrono::system_clock> val,
+  auto format(std::chrono::time_point<std::chrono::system_clock, Duration> val,
               FormatContext& ctx) const -> decltype(ctx.out()) {
-    return formatter<std::tm, Char>::format(localtime(val), ctx);
+    std::tm tm = gmtime(val);  // presumably UTC calendar time for val
+    using period = typename Duration::period;
+    if (detail::const_check(  // whole-second integral rep: no fraction
+            period::num == 1 && period::den == 1 &&
+            !std::is_floating_point<typename Duration::rep>::value)) {
+      return formatter<std::tm, Char>::format(tm, ctx);
+    }
+    Duration epoch = val.time_since_epoch();
+    Duration subsecs = detail::fmt_duration_cast<Duration>(
+        epoch - detail::fmt_duration_cast<std::chrono::seconds>(epoch));
+    if (subsecs.count() < 0) {  // negative fraction: borrow one second
+      auto second =
+          detail::fmt_duration_cast<Duration>(std::chrono::seconds(1));
+      if (tm.tm_sec != 0)
+        --tm.tm_sec;
+      else
+        tm = gmtime(val - second);  // tm_sec is 0: recompute all fields
+      subsecs += detail::fmt_duration_cast<Duration>(std::chrono::seconds(1));
+    }
+    return formatter<std::tm, Char>::do_format(tm, ctx, &subsecs);
   }
 };
 
-#if FMT_USE_UTC_TIME
+#if FMT_USE_LOCAL_TIME
 template <typename Char, typename Duration>
-struct formatter<std::chrono::time_point<std::chrono::utc_clock, Duration>,
-                 Char> : formatter<std::tm, Char> {
+struct formatter<std::chrono::local_time<Duration>, Char>
+    : formatter<std::tm, Char> {  // default format is "%F %T" (see ctor)
   FMT_CONSTEXPR formatter() {
-    basic_string_view<Char> default_specs =
-        detail::string_literal<Char, '%', 'F', ' ', '%', 'T'>{};
-    this->do_parse(default_specs.begin(), default_specs.end());
+    this->format_str_ = detail::string_literal<Char, '%', 'F', ' ', '%', 'T'>{};
   }
 
   template <typename FormatContext>
-  auto format(std::chrono::time_point<std::chrono::utc_clock> val,
+  auto format(std::chrono::local_time<Duration> val, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    using period = typename Duration::period;
+    if (period::num != 1 || period::den != 1 ||  // not whole integral seconds
+        std::is_floating_point<typename Duration::rep>::value) {
+      const auto epoch = val.time_since_epoch();
+      const auto subsecs = detail::fmt_duration_cast<Duration>(
+          epoch - detail::fmt_duration_cast<std::chrono::seconds>(epoch));
+      // NOTE(review): negative subsecs are not fixed up as for system_clock.
+      return formatter<std::tm, Char>::do_format(localtime(val), ctx, &subsecs);
+    }
+
+    return formatter<std::tm, Char>::format(localtime(val), ctx);
+  }
+};
+#endif
+
+#if FMT_USE_UTC_TIME
+template <typename Char, typename Duration>
+struct formatter<std::chrono::time_point<std::chrono::utc_clock, Duration>,
+                 Char>  // converts to system_clock and reuses its formatter
+    : formatter<std::chrono::time_point<std::chrono::system_clock, Duration>,
+                Char> {
+  template <typename FormatContext>
+  auto format(std::chrono::time_point<std::chrono::utc_clock, Duration> val,
               FormatContext& ctx) const -> decltype(ctx.out()) {
-    return formatter<std::tm, Char>::format(
-        localtime(std::chrono::utc_clock::to_sys(val)), ctx);
+    return formatter<
+        std::chrono::time_point<std::chrono::system_clock, Duration>,
+        Char>::format(std::chrono::utc_clock::to_sys(val), ctx);
   }
 };
 #endif
 
 template <typename Char> struct formatter<std::tm, Char> {
  private:
-  enum class spec {
-    unknown,
-    year_month_day,
-    hh_mm_ss,
-  };
-  spec spec_ = spec::unknown;
-  basic_string_view<Char> specs;
+  format_specs specs_;  // align and width parsed from the spec
+  detail::arg_ref<Char> width_ref_;  // dynamic width argument, if any
 
  protected:
-  template <typename It> FMT_CONSTEXPR auto do_parse(It begin, It end) -> It {
-    if (begin != end && *begin == ':') ++begin;
-    end = detail::parse_chrono_format(begin, end, detail::tm_format_checker());
-    // Replace default spec only if the new spec is not empty.
-    if (end != begin) specs = {begin, detail::to_unsigned(end - begin)};
-    return end;
+  basic_string_view<Char> format_str_;  // chrono spec, preset by subclasses
+
+  template <typename FormatContext, typename Duration>
+  auto do_format(const std::tm& tm, FormatContext& ctx,  // subsecs may be null
+                 const Duration* subsecs) const -> decltype(ctx.out()) {
+    auto specs = specs_;  // local copy: do_format() is const
+    auto buf = basic_memory_buffer<Char>();  // formatted here, written later
+    auto out = std::back_inserter(buf);
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width, width_ref_,
+                                                       ctx);
+
+    auto loc_ref = ctx.locale();
+    detail::get_locale loc(static_cast<bool>(loc_ref), loc_ref);
+    auto w =
+        detail::tm_writer<decltype(out), Char, Duration>(loc, out, tm, subsecs);
+    detail::parse_chrono_format(format_str_.begin(), format_str_.end(), w);
+    return detail::write(
+        ctx.out(), basic_string_view<Char>(buf.data(), buf.size()), specs);
   }
 
  public:
   FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
       -> decltype(ctx.begin()) {
-    auto end = this->do_parse(ctx.begin(), ctx.end());
-    // basic_string_view<>::compare isn't constexpr before C++17.
-    if (specs.size() == 2 && specs[0] == Char('%')) {
-      if (specs[1] == Char('F'))
-        spec_ = spec::year_month_day;
-      else if (specs[1] == Char('T'))
-        spec_ = spec::hh_mm_ss;
-    }
+    auto it = ctx.begin(), end = ctx.end();
+    if (it == end || *it == '}') return it;  // empty spec: keep defaults
+
+    it = detail::parse_align(it, end, specs_);
+    if (it == end) return it;
+
+    it = detail::parse_dynamic_spec(it, end, specs_.width, width_ref_, ctx);
+    if (it == end) return it;
+
+    end = detail::parse_chrono_format(it, end, detail::tm_format_checker());
+    // Replace the default format_str only if the new spec is not empty.
+    if (end != it) format_str_ = {it, detail::to_unsigned(end - it)};
     return end;
   }
 
   template <typename FormatContext>
   auto format(const std::tm& tm, FormatContext& ctx) const
       -> decltype(ctx.out()) {
-    const auto loc_ref = ctx.locale();
-    detail::get_locale loc(static_cast<bool>(loc_ref), loc_ref);
-    auto w = detail::tm_writer<decltype(ctx.out()), Char>(loc, ctx.out(), tm);
-    if (spec_ == spec::year_month_day)
-      w.on_iso_date();
-    else if (spec_ == spec::hh_mm_ss)
-      w.on_iso_time();
-    else
-      detail::parse_chrono_format(specs.begin(), specs.end(), w);
-    return w.out();
+    return do_format<FormatContext, std::chrono::seconds>(tm, ctx, nullptr);
   }
 };
 
-FMT_MODULE_EXPORT_END
+FMT_END_EXPORT
 FMT_END_NAMESPACE
 
 #endif  // FMT_CHRONO_H_
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/color.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/color.h
index e9b880ad431c..f0e9dd94ef3a 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/color.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/color.h
@@ -11,7 +11,7 @@
 #include "format.h"
 
 FMT_BEGIN_NAMESPACE
-FMT_MODULE_EXPORT_BEGIN
+FMT_BEGIN_EXPORT
 
 enum class color : uint32_t {
   alice_blue = 0xF0F8FF,               // rgb(240,248,255)
@@ -203,7 +203,7 @@ struct rgb {
   uint8_t b;
 };
 
-FMT_BEGIN_DETAIL_NAMESPACE
+namespace detail {
 
 // color is a struct of either a rgb color or a terminal color.
 struct color_type {
@@ -225,22 +225,21 @@ struct color_type {
     uint32_t rgb_color;
   } value;
 };
+}  // namespace detail
 
-FMT_END_DETAIL_NAMESPACE
-
-/** A text style consisting of foreground and background colors and emphasis. */
+/// A text style consisting of foreground and background colors and emphasis.
 class text_style {
  public:
   FMT_CONSTEXPR text_style(emphasis em = emphasis()) noexcept
       : set_foreground_color(), set_background_color(), ems(em) {}
 
-  FMT_CONSTEXPR text_style& operator|=(const text_style& rhs) {
+  FMT_CONSTEXPR auto operator|=(const text_style& rhs) -> text_style& {
     if (!set_foreground_color) {
       set_foreground_color = rhs.set_foreground_color;
       foreground_color = rhs.foreground_color;
     } else if (rhs.set_foreground_color) {
       if (!foreground_color.is_rgb || !rhs.foreground_color.is_rgb)
-        FMT_THROW(format_error("can't OR a terminal color"));
+        report_error("can't OR a terminal color");
       foreground_color.value.rgb_color |= rhs.foreground_color.value.rgb_color;
     }
 
@@ -249,7 +248,7 @@ class text_style {
       background_color = rhs.background_color;
     } else if (rhs.set_background_color) {
       if (!background_color.is_rgb || !rhs.background_color.is_rgb)
-        FMT_THROW(format_error("can't OR a terminal color"));
+        report_error("can't OR a terminal color");
       background_color.value.rgb_color |= rhs.background_color.value.rgb_color;
     }
 
@@ -258,29 +257,29 @@ class text_style {
     return *this;
   }
 
-  friend FMT_CONSTEXPR text_style operator|(text_style lhs,
-                                            const text_style& rhs) {
+  friend FMT_CONSTEXPR auto operator|(text_style lhs, const text_style& rhs)
+      -> text_style {
     return lhs |= rhs;
   }
 
-  FMT_CONSTEXPR bool has_foreground() const noexcept {
+  FMT_CONSTEXPR auto has_foreground() const noexcept -> bool {
     return set_foreground_color;
   }
-  FMT_CONSTEXPR bool has_background() const noexcept {
+  FMT_CONSTEXPR auto has_background() const noexcept -> bool {
     return set_background_color;
   }
-  FMT_CONSTEXPR bool has_emphasis() const noexcept {
+  FMT_CONSTEXPR auto has_emphasis() const noexcept -> bool {
     return static_cast<uint8_t>(ems) != 0;
   }
-  FMT_CONSTEXPR detail::color_type get_foreground() const noexcept {
+  FMT_CONSTEXPR auto get_foreground() const noexcept -> detail::color_type {
     FMT_ASSERT(has_foreground(), "no foreground specified for this style");
     return foreground_color;
   }
-  FMT_CONSTEXPR detail::color_type get_background() const noexcept {
+  FMT_CONSTEXPR auto get_background() const noexcept -> detail::color_type {
     FMT_ASSERT(has_background(), "no background specified for this style");
     return background_color;
   }
-  FMT_CONSTEXPR emphasis get_emphasis() const noexcept {
+  FMT_CONSTEXPR auto get_emphasis() const noexcept -> emphasis {
     FMT_ASSERT(has_emphasis(), "no emphasis specified for this style");
     return ems;
   }
@@ -298,9 +297,11 @@ class text_style {
     }
   }
 
-  friend FMT_CONSTEXPR text_style fg(detail::color_type foreground) noexcept;
+  friend FMT_CONSTEXPR auto fg(detail::color_type foreground) noexcept
+      -> text_style;
 
-  friend FMT_CONSTEXPR text_style bg(detail::color_type background) noexcept;
+  friend FMT_CONSTEXPR auto bg(detail::color_type background) noexcept
+      -> text_style;
 
   detail::color_type foreground_color;
   detail::color_type background_color;
@@ -309,21 +310,24 @@ class text_style {
   emphasis ems;
 };
 
-/** Creates a text style from the foreground (text) color. */
-FMT_CONSTEXPR inline text_style fg(detail::color_type foreground) noexcept {
+/// Creates a text style from the foreground (text) color.
+FMT_CONSTEXPR inline auto fg(detail::color_type foreground) noexcept
+    -> text_style {
   return text_style(true, foreground);
 }
 
-/** Creates a text style from the background color. */
-FMT_CONSTEXPR inline text_style bg(detail::color_type background) noexcept {
+/// Creates a text style from the background color.
+FMT_CONSTEXPR inline auto bg(detail::color_type background) noexcept
+    -> text_style {
   return text_style(false, background);
 }
 
-FMT_CONSTEXPR inline text_style operator|(emphasis lhs, emphasis rhs) noexcept {
+FMT_CONSTEXPR inline auto operator|(emphasis lhs, emphasis rhs) noexcept
+    -> text_style {
   return text_style(lhs) | rhs;
 }
 
-FMT_BEGIN_DETAIL_NAMESPACE
+namespace detail {
 
 template <typename Char> struct ansi_color_escape {
   FMT_CONSTEXPR ansi_color_escape(detail::color_type text_color,
@@ -385,9 +389,9 @@ template <typename Char> struct ansi_color_escape {
   }
   FMT_CONSTEXPR operator const Char*() const noexcept { return buffer; }
 
-  FMT_CONSTEXPR const Char* begin() const noexcept { return buffer; }
-  FMT_CONSTEXPR_CHAR_TRAITS const Char* end() const noexcept {
-    return buffer + std::char_traits<Char>::length(buffer);
+  FMT_CONSTEXPR auto begin() const noexcept -> const Char* { return buffer; }
+  FMT_CONSTEXPR20 auto end() const noexcept -> const Char* {
+    return buffer + basic_string_view<Char>(buffer).size();
   }
 
  private:
@@ -401,66 +405,45 @@ template <typename Char> struct ansi_color_escape {
     out[2] = static_cast<Char>('0' + c % 10);
     out[3] = static_cast<Char>(delimiter);
   }
-  static FMT_CONSTEXPR bool has_emphasis(emphasis em, emphasis mask) noexcept {
+  static FMT_CONSTEXPR auto has_emphasis(emphasis em, emphasis mask) noexcept
+      -> bool {
     return static_cast<uint8_t>(em) & static_cast<uint8_t>(mask);
   }
 };
 
 template <typename Char>
-FMT_CONSTEXPR ansi_color_escape<Char> make_foreground_color(
-    detail::color_type foreground) noexcept {
+FMT_CONSTEXPR auto make_foreground_color(detail::color_type foreground) noexcept
+    -> ansi_color_escape<Char> {
   return ansi_color_escape<Char>(foreground, "\x1b[38;2;");
 }
 
 template <typename Char>
-FMT_CONSTEXPR ansi_color_escape<Char> make_background_color(
-    detail::color_type background) noexcept {
+FMT_CONSTEXPR auto make_background_color(detail::color_type background) noexcept
+    -> ansi_color_escape<Char> {
   return ansi_color_escape<Char>(background, "\x1b[48;2;");
 }
 
 template <typename Char>
-FMT_CONSTEXPR ansi_color_escape<Char> make_emphasis(emphasis em) noexcept {
+FMT_CONSTEXPR auto make_emphasis(emphasis em) noexcept
+    -> ansi_color_escape<Char> {
   return ansi_color_escape<Char>(em);
 }
 
-template <typename Char> inline void fputs(const Char* chars, FILE* stream) {
-  int result = std::fputs(chars, stream);
-#if !__NVCC__
-  if (result < 0)
-    FMT_THROW(system_error(errno, FMT_STRING("cannot write to file")));
-#endif
-}
-
-template <> inline void fputs<wchar_t>(const wchar_t* chars, FILE* stream) {
-  int result = std::fputws(chars, stream);
-#if !__NVCC__
-  if (result < 0)
-    FMT_THROW(system_error(errno, FMT_STRING("cannot write to file")));
-#endif
-}
-
-template <typename Char> inline void reset_color(FILE* stream) {
-  fputs("\x1b[0m", stream);
-}
-
-template <> inline void reset_color<wchar_t>(FILE* stream) {
-  fputs(L"\x1b[0m", stream);
-}
-
 template <typename Char> inline void reset_color(buffer<Char>& buffer) {
   auto reset_color = string_view("\x1b[0m");
   buffer.append(reset_color.begin(), reset_color.end());
 }
 
-template <typename T> struct styled_arg {
+template <typename T> struct styled_arg : detail::view {
   const T& value;
   text_style style;
+  styled_arg(const T& v, text_style s) : value(v), style(s) {}
 };
 
 template <typename Char>
-void vformat_to(buffer<Char>& buf, const text_style& ts,
-                basic_string_view<Char> format_str,
-                basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+void vformat_to(
+    buffer<Char>& buf, const text_style& ts, basic_string_view<Char> format_str,
+    basic_format_args<buffered_context<type_identity_t<Char>>> args) {
   bool has_style = false;
   if (ts.has_emphasis()) {
     has_style = true;
@@ -481,118 +464,94 @@ void vformat_to(buffer<Char>& buf, const text_style& ts,
   if (has_style) detail::reset_color<Char>(buf);
 }
 
-FMT_END_DETAIL_NAMESPACE
-
-template <typename S, typename Char = char_t<S>>
-void vprint(std::FILE* f, const text_style& ts, const S& format,
-            basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  basic_memory_buffer<Char> buf;
-  detail::vformat_to(buf, ts, detail::to_string_view(format), args);
-  if (detail::is_utf8()) {
-    detail::print(f, basic_string_view<Char>(buf.begin(), buf.size()));
-  } else {
-    buf.push_back(Char(0));
-    detail::fputs(buf.data(), f);
-  }
+}  // namespace detail
+
+inline void vprint(FILE* f, const text_style& ts, string_view fmt,
+                   format_args args) {
+  auto buf = memory_buffer();
+  detail::vformat_to(buf, ts, fmt, args);
+  print(f, FMT_STRING("{}"), string_view(buf.begin(), buf.size()));
 }
 
 /**
-  \rst
-  Formats a string and prints it to the specified file stream using ANSI
-  escape sequences to specify text formatting.
-
-  **Example**::
-
-    fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
-               "Elapsed time: {0:.2f} seconds", 1.23);
-  \endrst
+ * Formats a string and prints it to the specified file stream using ANSI
+ * escape sequences to specify text formatting.
+ *
+ * **Example**:
+ *
+ *     fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
+ *                "Elapsed time: {0:.2f} seconds", 1.23);
  */
-template <typename S, typename... Args,
-          FMT_ENABLE_IF(detail::is_string<S>::value)>
-void print(std::FILE* f, const text_style& ts, const S& format_str,
-           const Args&... args) {
-  vprint(f, ts, format_str,
-         fmt::make_format_args<buffer_context<char_t<S>>>(args...));
+template <typename... T>
+void print(FILE* f, const text_style& ts, format_string<T...> fmt,
+           T&&... args) {
+  vprint(f, ts, fmt, fmt::make_format_args(args...));
 }
 
 /**
-  \rst
-  Formats a string and prints it to stdout using ANSI escape sequences to
-  specify text formatting.
-
-  **Example**::
-
-    fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
-               "Elapsed time: {0:.2f} seconds", 1.23);
-  \endrst
+ * Formats a string and prints it to stdout using ANSI escape sequences to
+ * specify text formatting.
+ *
+ * **Example**:
+ *
+ *     fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
+ *                "Elapsed time: {0:.2f} seconds", 1.23);
  */
-template <typename S, typename... Args,
-          FMT_ENABLE_IF(detail::is_string<S>::value)>
-void print(const text_style& ts, const S& format_str, const Args&... args) {
-  return print(stdout, ts, format_str, args...);
+template <typename... T>
+void print(const text_style& ts, format_string<T...> fmt, T&&... args) {
+  return print(stdout, ts, fmt, std::forward<T>(args)...);
 }
 
-template <typename S, typename Char = char_t<S>>
-inline std::basic_string<Char> vformat(
-    const text_style& ts, const S& format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  basic_memory_buffer<Char> buf;
-  detail::vformat_to(buf, ts, detail::to_string_view(format_str), args);
+inline auto vformat(const text_style& ts, string_view fmt, format_args args)
+    -> std::string {
+  auto buf = memory_buffer();
+  detail::vformat_to(buf, ts, fmt, args);
   return fmt::to_string(buf);
 }
 
 /**
-  \rst
-  Formats arguments and returns the result as a string using ANSI
-  escape sequences to specify text formatting.
-
-  **Example**::
-
-    #include <fmt/color.h>
-    std::string message = fmt::format(fmt::emphasis::bold | fg(fmt::color::red),
-                                      "The answer is {}", 42);
-  \endrst
-*/
-template <typename S, typename... Args, typename Char = char_t<S>>
-inline std::basic_string<Char> format(const text_style& ts, const S& format_str,
-                                      const Args&... args) {
-  return fmt::vformat(ts, detail::to_string_view(format_str),
-                      fmt::make_format_args<buffer_context<Char>>(args...));
+ * Formats arguments and returns the result as a string using ANSI escape
+ * sequences to specify text formatting.
+ *
+ * **Example**:
+ *
+ * ```
+ * #include <fmt/color.h>
+ * std::string message = fmt::format(fmt::emphasis::bold | fg(fmt::color::red),
+ *                                   "The answer is {}", 42);
+ * ```
+ */
+template <typename... T>
+inline auto format(const text_style& ts, format_string<T...> fmt, T&&... args)
+    -> std::string {
+  return fmt::vformat(ts, fmt, fmt::make_format_args(args...));
 }
 
-/**
-  Formats a string with the given text_style and writes the output to ``out``.
- */
-template <typename OutputIt, typename Char,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
-OutputIt vformat_to(
-    OutputIt out, const text_style& ts, basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  auto&& buf = detail::get_buffer<Char>(out);
-  detail::vformat_to(buf, ts, format_str, args);
+/// Formats a string with the given text_style and writes the output to `out`.
+template <typename OutputIt,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+auto vformat_to(OutputIt out, const text_style& ts, string_view fmt,
+                format_args args) -> OutputIt {
+  auto&& buf = detail::get_buffer<char>(out);
+  detail::vformat_to(buf, ts, fmt, args);
   return detail::get_iterator(buf, out);
 }
 
 /**
-  \rst
-  Formats arguments with the given text_style, writes the result to the output
-  iterator ``out`` and returns the iterator past the end of the output range.
-
-  **Example**::
-
-    std::vector<char> out;
-    fmt::format_to(std::back_inserter(out),
-                   fmt::emphasis::bold | fg(fmt::color::red), "{}", 42);
-  \endrst
-*/
-template <typename OutputIt, typename S, typename... Args,
-          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value&&
-              detail::is_string<S>::value>
-inline auto format_to(OutputIt out, const text_style& ts, const S& format_str,
-                      Args&&... args) ->
-    typename std::enable_if<enable, OutputIt>::type {
-  return vformat_to(out, ts, detail::to_string_view(format_str),
-                    fmt::make_format_args<buffer_context<char_t<S>>>(args...));
+ * Formats arguments with the given text style, writes the result to the output
+ * iterator `out` and returns the iterator past the end of the output range.
+ *
+ * **Example**:
+ *
+ *     std::vector<char> out;
+ *     fmt::format_to(std::back_inserter(out),
+ *                    fmt::emphasis::bold | fg(fmt::color::red), "{}", 42);
+ */
+template <typename OutputIt, typename... T,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
+inline auto format_to(OutputIt out, const text_style& ts,
+                      format_string<T...> fmt, T&&... args) -> OutputIt {
+  return vformat_to(out, ts, fmt, fmt::make_format_args(args...));
 }
 
 template <typename T, typename Char>
@@ -632,16 +591,14 @@ struct formatter<detail::styled_arg<T>, Char> : formatter<T, Char> {
 };
 
 /**
-  \rst
-  Returns an argument that will be formatted using ANSI escape sequences,
-  to be used in a formatting function.
-
-  **Example**::
-
-    fmt::print("Elapsed time: {0:.2f} seconds",
-               fmt::styled(1.23, fmt::fg(fmt::color::green) |
-                                 fmt::bg(fmt::color::blue)));
-  \endrst
+ * Returns an argument that will be formatted using ANSI escape sequences,
+ * to be used in a formatting function.
+ *
+ * **Example**:
+ *
+ *     fmt::print("Elapsed time: {0:.2f} seconds",
+ *                fmt::styled(1.23, fmt::fg(fmt::color::green) |
+ *                                  fmt::bg(fmt::color::blue)));
  */
 template <typename T>
 FMT_CONSTEXPR auto styled(const T& value, text_style ts)
@@ -649,7 +606,7 @@ FMT_CONSTEXPR auto styled(const T& value, text_style ts)
   return detail::styled_arg<remove_cvref_t<T>>{value, ts};
 }
 
-FMT_MODULE_EXPORT_END
+FMT_END_EXPORT
 FMT_END_NAMESPACE
 
 #endif  // FMT_COLOR_H_
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/compile.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/compile.h
index 933668c41c3e..b2afc2c309f4 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/compile.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/compile.h
@@ -8,117 +8,41 @@
 #ifndef FMT_COMPILE_H_
 #define FMT_COMPILE_H_
 
+#ifndef FMT_MODULE
+#  include <iterator>  // std::back_inserter
+#endif
+
 #include "format.h"
 
 FMT_BEGIN_NAMESPACE
+
+// A compile-time string which is compiled into fast formatting code.
+FMT_EXPORT class compiled_string {};
+
 namespace detail {
 
-template <typename Char, typename InputIt>
-FMT_CONSTEXPR inline counting_iterator copy_str(InputIt begin, InputIt end,
-                                                counting_iterator it) {
+template <typename T, typename InputIt>
+FMT_CONSTEXPR inline auto copy(InputIt begin, InputIt end, counting_iterator it)
+    -> counting_iterator {
   return it + (end - begin);
 }
 
-template <typename OutputIt> class truncating_iterator_base {
- protected:
-  OutputIt out_;
-  size_t limit_;
-  size_t count_ = 0;
-
-  truncating_iterator_base() : out_(), limit_(0) {}
-
-  truncating_iterator_base(OutputIt out, size_t limit)
-      : out_(out), limit_(limit) {}
-
- public:
-  using iterator_category = std::output_iterator_tag;
-  using value_type = typename std::iterator_traits<OutputIt>::value_type;
-  using difference_type = std::ptrdiff_t;
-  using pointer = void;
-  using reference = void;
-  FMT_UNCHECKED_ITERATOR(truncating_iterator_base);
-
-  OutputIt base() const { return out_; }
-  size_t count() const { return count_; }
-};
-
-// An output iterator that truncates the output and counts the number of objects
-// written to it.
-template <typename OutputIt,
-          typename Enable = typename std::is_void<
-              typename std::iterator_traits<OutputIt>::value_type>::type>
-class truncating_iterator;
-
-template <typename OutputIt>
-class truncating_iterator<OutputIt, std::false_type>
-    : public truncating_iterator_base<OutputIt> {
-  mutable typename truncating_iterator_base<OutputIt>::value_type blackhole_;
-
- public:
-  using value_type = typename truncating_iterator_base<OutputIt>::value_type;
-
-  truncating_iterator() = default;
-
-  truncating_iterator(OutputIt out, size_t limit)
-      : truncating_iterator_base<OutputIt>(out, limit) {}
-
-  truncating_iterator& operator++() {
-    if (this->count_++ < this->limit_) ++this->out_;
-    return *this;
-  }
-
-  truncating_iterator operator++(int) {
-    auto it = *this;
-    ++*this;
-    return it;
-  }
-
-  value_type& operator*() const {
-    return this->count_ < this->limit_ ? *this->out_ : blackhole_;
-  }
-};
-
-template <typename OutputIt>
-class truncating_iterator<OutputIt, std::true_type>
-    : public truncating_iterator_base<OutputIt> {
- public:
-  truncating_iterator() = default;
-
-  truncating_iterator(OutputIt out, size_t limit)
-      : truncating_iterator_base<OutputIt>(out, limit) {}
-
-  template <typename T> truncating_iterator& operator=(T val) {
-    if (this->count_++ < this->limit_) *this->out_++ = val;
-    return *this;
-  }
-
-  truncating_iterator& operator++() { return *this; }
-  truncating_iterator& operator++(int) { return *this; }
-  truncating_iterator& operator*() { return *this; }
-};
-
-// A compile-time string which is compiled into fast formatting code.
-class compiled_string {};
-
 template <typename S>
 struct is_compiled_string : std::is_base_of<compiled_string, S> {};
 
 /**
-  \rst
-  Converts a string literal *s* into a format string that will be parsed at
-  compile time and converted into efficient formatting code. Requires C++17
-  ``constexpr if`` compiler support.
-
-  **Example**::
-
-    // Converts 42 into std::string using the most efficient method and no
-    // runtime format string processing.
-    std::string s = fmt::format(FMT_COMPILE("{}"), 42);
-  \endrst
+ * Converts a string literal `s` into a format string that will be parsed at
+ * compile time and converted into efficient formatting code. Requires C++17
+ * `constexpr if` compiler support.
+ *
+ * **Example**:
+ *
+ *     // Converts 42 into std::string using the most efficient method and no
+ *     // runtime format string processing.
+ *     std::string s = fmt::format(FMT_COMPILE("{}"), 42);
  */
 #if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
-#  define FMT_COMPILE(s) \
-    FMT_STRING_IMPL(s, fmt::detail::compiled_string, explicit)
+#  define FMT_COMPILE(s) FMT_STRING_IMPL(s, fmt::compiled_string, explicit)
 #else
 #  define FMT_COMPILE(s) FMT_STRING(s)
 #endif
@@ -135,7 +59,7 @@ struct udl_compiled_string : compiled_string {
 #endif
 
 template <typename T, typename... Tail>
-const T& first(const T& value, const Tail&...) {
+auto first(const T& value, const Tail&...) -> const T& {
   return value;
 }
 
@@ -196,7 +120,8 @@ template <typename Char> struct code_unit {
 
   template <typename OutputIt, typename... Args>
   constexpr OutputIt format(OutputIt out, const Args&...) const {
-    return write<Char>(out, value);
+    *out++ = value;
+    return out;
   }
 };
 
@@ -220,7 +145,12 @@ template <typename Char, typename T, int N> struct field {
 
   template <typename OutputIt, typename... Args>
   constexpr OutputIt format(OutputIt out, const Args&... args) const {
-    return write<Char>(out, get_arg_checked<T, N>(args...));
+    const T& arg = get_arg_checked<T, N>(args...);
+    if constexpr (std::is_convertible<T, basic_string_view<Char>>::value) {
+      auto s = basic_string_view<Char>(arg);
+      return copy<Char>(s.begin(), s.end(), out);
+    }
+    return write<Char>(out, arg);
   }
 };
 
@@ -308,13 +238,12 @@ constexpr size_t parse_text(basic_string_view<Char> str, size_t pos) {
 }
 
 template <typename Args, size_t POS, int ID, typename S>
-constexpr auto compile_format_string(S format_str);
+constexpr auto compile_format_string(S fmt);
 
 template <typename Args, size_t POS, int ID, typename T, typename S>
-constexpr auto parse_tail(T head, S format_str) {
-  if constexpr (POS !=
-                basic_string_view<typename S::char_type>(format_str).size()) {
-    constexpr auto tail = compile_format_string<Args, POS, ID>(format_str);
+constexpr auto parse_tail(T head, S fmt) {
+  if constexpr (POS != basic_string_view<typename S::char_type>(fmt).size()) {
+    constexpr auto tail = compile_format_string<Args, POS, ID>(fmt);
     if constexpr (std::is_same<remove_cvref_t<decltype(tail)>,
                                unknown_format>())
       return tail;
@@ -331,14 +260,14 @@ template <typename T, typename Char> struct parse_specs_result {
   int next_arg_id;
 };
 
-constexpr int manual_indexing_id = -1;
+enum { manual_indexing_id = -1 };
 
 template <typename T, typename Char>
 constexpr parse_specs_result<T, Char> parse_specs(basic_string_view<Char> str,
                                                   size_t pos, int next_arg_id) {
   str.remove_prefix(pos);
-  auto ctx = compile_parse_context<Char>(str, max_value<int>(), nullptr, {},
-                                         next_arg_id);
+  auto ctx =
+      compile_parse_context<Char>(str, max_value<int>(), nullptr, next_arg_id);
   auto f = formatter<T, Char>();
   auto end = f.parse(ctx);
   return {f, pos + fmt::detail::to_unsigned(end - str.data()),
@@ -348,22 +277,18 @@ constexpr parse_specs_result<T, Char> parse_specs(basic_string_view<Char> str,
 template <typename Char> struct arg_id_handler {
   arg_ref<Char> arg_id;
 
-  constexpr int operator()() {
+  constexpr int on_auto() {
     FMT_ASSERT(false, "handler cannot be used with automatic indexing");
     return 0;
   }
-  constexpr int operator()(int id) {
+  constexpr int on_index(int id) {
     arg_id = arg_ref<Char>(id);
     return 0;
   }
-  constexpr int operator()(basic_string_view<Char> id) {
+  constexpr int on_name(basic_string_view<Char> id) {
     arg_id = arg_ref<Char>(id);
     return 0;
   }
-
-  constexpr void on_error(const char* message) {
-    FMT_THROW(format_error(message));
-  }
 };
 
 template <typename Char> struct parse_arg_id_result {
@@ -389,14 +314,13 @@ struct field_type<T, enable_if_t<detail::is_named_arg<T>::value>> {
 
 template <typename T, typename Args, size_t END_POS, int ARG_INDEX, int NEXT_ID,
           typename S>
-constexpr auto parse_replacement_field_then_tail(S format_str) {
+constexpr auto parse_replacement_field_then_tail(S fmt) {
   using char_type = typename S::char_type;
-  constexpr auto str = basic_string_view<char_type>(format_str);
+  constexpr auto str = basic_string_view<char_type>(fmt);
   constexpr char_type c = END_POS != str.size() ? str[END_POS] : char_type();
   if constexpr (c == '}') {
     return parse_tail<Args, END_POS + 1, NEXT_ID>(
-        field<char_type, typename field_type<T>::type, ARG_INDEX>(),
-        format_str);
+        field<char_type, typename field_type<T>::type, ARG_INDEX>(), fmt);
   } else if constexpr (c != ':') {
     FMT_THROW(format_error("expected ':'"));
   } else {
@@ -409,7 +333,7 @@ constexpr auto parse_replacement_field_then_tail(S format_str) {
       return parse_tail<Args, result.end + 1, result.next_arg_id>(
           spec_field<char_type, typename field_type<T>::type, ARG_INDEX>{
               result.fmt},
-          format_str);
+          fmt);
     }
   }
 }
@@ -417,22 +341,21 @@ constexpr auto parse_replacement_field_then_tail(S format_str) {
 // Compiles a non-empty format string and returns the compiled representation
 // or unknown_format() on unrecognized input.
 template <typename Args, size_t POS, int ID, typename S>
-constexpr auto compile_format_string(S format_str) {
+constexpr auto compile_format_string(S fmt) {
   using char_type = typename S::char_type;
-  constexpr auto str = basic_string_view<char_type>(format_str);
+  constexpr auto str = basic_string_view<char_type>(fmt);
   if constexpr (str[POS] == '{') {
     if constexpr (POS + 1 == str.size())
       FMT_THROW(format_error("unmatched '{' in format string"));
     if constexpr (str[POS + 1] == '{') {
-      return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), format_str);
+      return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), fmt);
     } else if constexpr (str[POS + 1] == '}' || str[POS + 1] == ':') {
       static_assert(ID != manual_indexing_id,
                     "cannot switch from manual to automatic argument indexing");
       constexpr auto next_id =
           ID != manual_indexing_id ? ID + 1 : manual_indexing_id;
       return parse_replacement_field_then_tail<get_type<ID, Args>, Args,
-                                               POS + 1, ID, next_id>(
-          format_str);
+                                               POS + 1, ID, next_id>(fmt);
     } else {
       constexpr auto arg_id_result =
           parse_arg_id<ID>(str.data() + POS + 1, str.data() + str.size());
@@ -448,60 +371,55 @@ constexpr auto compile_format_string(S format_str) {
         return parse_replacement_field_then_tail<get_type<arg_index, Args>,
                                                  Args, arg_id_end_pos,
                                                  arg_index, manual_indexing_id>(
-            format_str);
+            fmt);
       } else if constexpr (arg_id_result.arg_id.kind == arg_id_kind::name) {
         constexpr auto arg_index =
             get_arg_index_by_name(arg_id_result.arg_id.val.name, Args{});
-        if constexpr (arg_index != invalid_arg_index) {
+        if constexpr (arg_index >= 0) {
           constexpr auto next_id =
               ID != manual_indexing_id ? ID + 1 : manual_indexing_id;
           return parse_replacement_field_then_tail<
               decltype(get_type<arg_index, Args>::value), Args, arg_id_end_pos,
-              arg_index, next_id>(format_str);
-        } else {
-          if constexpr (c == '}') {
-            return parse_tail<Args, arg_id_end_pos + 1, ID>(
-                runtime_named_field<char_type>{arg_id_result.arg_id.val.name},
-                format_str);
-          } else if constexpr (c == ':') {
-            return unknown_format();  // no type info for specs parsing
-          }
+              arg_index, next_id>(fmt);
+        } else if constexpr (c == '}') {
+          return parse_tail<Args, arg_id_end_pos + 1, ID>(
+              runtime_named_field<char_type>{arg_id_result.arg_id.val.name},
+              fmt);
+        } else if constexpr (c == ':') {
+          return unknown_format();  // no type info for specs parsing
         }
       }
     }
   } else if constexpr (str[POS] == '}') {
     if constexpr (POS + 1 == str.size())
       FMT_THROW(format_error("unmatched '}' in format string"));
-    return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), format_str);
+    return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), fmt);
   } else {
     constexpr auto end = parse_text(str, POS + 1);
     if constexpr (end - POS > 1) {
-      return parse_tail<Args, end, ID>(make_text(str, POS, end - POS),
-                                       format_str);
+      return parse_tail<Args, end, ID>(make_text(str, POS, end - POS), fmt);
     } else {
-      return parse_tail<Args, end, ID>(code_unit<char_type>{str[POS]},
-                                       format_str);
+      return parse_tail<Args, end, ID>(code_unit<char_type>{str[POS]}, fmt);
     }
   }
 }
 
 template <typename... Args, typename S,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-constexpr auto compile(S format_str) {
-  constexpr auto str = basic_string_view<typename S::char_type>(format_str);
+constexpr auto compile(S fmt) {
+  constexpr auto str = basic_string_view<typename S::char_type>(fmt);
   if constexpr (str.size() == 0) {
     return detail::make_text(str, 0, 0);
   } else {
     constexpr auto result =
-        detail::compile_format_string<detail::type_list<Args...>, 0, 0>(
-            format_str);
+        detail::compile_format_string<detail::type_list<Args...>, 0, 0>(fmt);
     return result;
   }
 }
 #endif  // defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
 }  // namespace detail
 
-FMT_MODULE_EXPORT_BEGIN
+FMT_BEGIN_EXPORT
 
 #if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction)
 
@@ -566,33 +484,33 @@ FMT_CONSTEXPR OutputIt format_to(OutputIt out, const S&, Args&&... args) {
 
 template <typename OutputIt, typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n,
-                                         const S& format_str, Args&&... args) {
-  auto it = fmt::format_to(detail::truncating_iterator<OutputIt>(out, n),
-                           format_str, std::forward<Args>(args)...);
-  return {it.base(), it.count()};
+auto format_to_n(OutputIt out, size_t n, const S& fmt, Args&&... args)
+    -> format_to_n_result<OutputIt> {
+  using traits = detail::fixed_buffer_traits;
+  auto buf = detail::iterator_buffer<OutputIt, char, traits>(out, n);
+  fmt::format_to(std::back_inserter(buf), fmt, std::forward<Args>(args)...);
+  return {buf.out(), buf.count()};
 }
 
 template <typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-FMT_CONSTEXPR20 size_t formatted_size(const S& format_str,
-                                      const Args&... args) {
-  return fmt::format_to(detail::counting_iterator(), format_str, args...)
-      .count();
+FMT_CONSTEXPR20 auto formatted_size(const S& fmt, const Args&... args)
+    -> size_t {
+  return fmt::format_to(detail::counting_iterator(), fmt, args...).count();
 }
 
 template <typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-void print(std::FILE* f, const S& format_str, const Args&... args) {
+void print(std::FILE* f, const S& fmt, const Args&... args) {
   memory_buffer buffer;
-  fmt::format_to(std::back_inserter(buffer), format_str, args...);
+  fmt::format_to(std::back_inserter(buffer), fmt, args...);
   detail::print(f, {buffer.data(), buffer.size()});
 }
 
 template <typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
-void print(const S& format_str, const Args&... args) {
-  print(stdout, format_str, args...);
+void print(const S& fmt, const Args&... args) {
+  print(stdout, fmt, args...);
 }
 
 #if FMT_USE_NONTYPE_TEMPLATE_ARGS
@@ -605,7 +523,7 @@ template <detail_exported::fixed_string Str> constexpr auto operator""_cf() {
 }  // namespace literals
 #endif
 
-FMT_MODULE_EXPORT_END
+FMT_END_EXPORT
 FMT_END_NAMESPACE
 
 #endif  // FMT_COMPILE_H_
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/core.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/core.h
index af61b22c44ec..8ca735f0c004 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/core.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/core.h
@@ -1,3343 +1,5 @@
-// Formatting library for C++ - the core API for char/UTF-8
-//
-// Copyright (c) 2012 - present, Victor Zverovich
-// All rights reserved.
-//
-// For the license information refer to format.h.
+// This file is only provided for compatibility and may be removed in future
+// versions. Use fmt/base.h if you don't need fmt::format and fmt/format.h
+// otherwise.
 
-#ifndef FMT_CORE_H_
-#define FMT_CORE_H_
-
-#include <cstddef>  // std::byte
-#include <cstdio>   // std::FILE
-#include <cstring>  // std::strlen
-#include <iterator>
-#include <limits>
-#include <string>
-#include <type_traits>
-
-// The fmt library version in the form major * 10000 + minor * 100 + patch.
-#define FMT_VERSION 90101
-
-#define FMT_HEADER_ONLY
-
-#if defined(__clang__) && !defined(__ibmxl__)
-#  define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
-#else
-#  define FMT_CLANG_VERSION 0
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && \
-    !defined(__NVCOMPILER)
-#  define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-#else
-#  define FMT_GCC_VERSION 0
-#endif
-
-#ifndef FMT_GCC_PRAGMA
-// Workaround _Pragma bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59884.
-#  if FMT_GCC_VERSION >= 504
-#    define FMT_GCC_PRAGMA(arg) _Pragma(arg)
-#  else
-#    define FMT_GCC_PRAGMA(arg)
-#  endif
-#endif
-
-#ifdef __ICL
-#  define FMT_ICC_VERSION __ICL
-#elif defined(__INTEL_COMPILER)
-#  define FMT_ICC_VERSION __INTEL_COMPILER
-#else
-#  define FMT_ICC_VERSION 0
-#endif
-
-#ifdef _MSC_VER
-#  define FMT_MSC_VERSION _MSC_VER
-#  define FMT_MSC_WARNING(...) __pragma(warning(__VA_ARGS__))
-#else
-#  define FMT_MSC_VERSION 0
-#  define FMT_MSC_WARNING(...)
-#endif
-
-#ifdef _MSVC_LANG
-#  define FMT_CPLUSPLUS _MSVC_LANG
-#else
-#  define FMT_CPLUSPLUS __cplusplus
-#endif
-
-#ifdef __has_feature
-#  define FMT_HAS_FEATURE(x) __has_feature(x)
-#else
-#  define FMT_HAS_FEATURE(x) 0
-#endif
-
-#if defined(__has_include) || FMT_ICC_VERSION >= 1600 || FMT_MSC_VERSION > 1900
-#  define FMT_HAS_INCLUDE(x) __has_include(x)
-#else
-#  define FMT_HAS_INCLUDE(x) 0
-#endif
-
-#ifdef __has_cpp_attribute
-#  define FMT_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
-#else
-#  define FMT_HAS_CPP_ATTRIBUTE(x) 0
-#endif
-
-#define FMT_HAS_CPP14_ATTRIBUTE(attribute) \
-  (FMT_CPLUSPLUS >= 201402L && FMT_HAS_CPP_ATTRIBUTE(attribute))
-
-#define FMT_HAS_CPP17_ATTRIBUTE(attribute) \
-  (FMT_CPLUSPLUS >= 201703L && FMT_HAS_CPP_ATTRIBUTE(attribute))
-
-// Check if relaxed C++14 constexpr is supported.
-// GCC doesn't allow throw in constexpr until version 6 (bug 67371).
-#ifndef FMT_USE_CONSTEXPR
-#  if (FMT_HAS_FEATURE(cxx_relaxed_constexpr) || FMT_MSC_VERSION >= 1912 || \
-       (FMT_GCC_VERSION >= 600 && FMT_CPLUSPLUS >= 201402L)) &&             \
-      !FMT_ICC_VERSION && !defined(__NVCC__)
-#    define FMT_USE_CONSTEXPR 1
-#  else
-#    define FMT_USE_CONSTEXPR 0
-#  endif
-#endif
-#if FMT_USE_CONSTEXPR
-#  define FMT_CONSTEXPR constexpr
-#else
-#  define FMT_CONSTEXPR
-#endif
-
-#if ((FMT_CPLUSPLUS >= 202002L) &&                            \
-     (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE > 9)) || \
-    (FMT_CPLUSPLUS >= 201709L && FMT_GCC_VERSION >= 1002)
-#  define FMT_CONSTEXPR20 constexpr
-#else
-#  define FMT_CONSTEXPR20
-#endif
-
-// Check if constexpr std::char_traits<>::{compare,length} are supported.
-#if defined(__GLIBCXX__)
-#  if FMT_CPLUSPLUS >= 201703L && defined(_GLIBCXX_RELEASE) && \
-      _GLIBCXX_RELEASE >= 7  // GCC 7+ libstdc++ has _GLIBCXX_RELEASE.
-#    define FMT_CONSTEXPR_CHAR_TRAITS constexpr
-#  endif
-#elif defined(_LIBCPP_VERSION) && FMT_CPLUSPLUS >= 201703L && \
-    _LIBCPP_VERSION >= 4000
-#  define FMT_CONSTEXPR_CHAR_TRAITS constexpr
-#elif FMT_MSC_VERSION >= 1914 && FMT_CPLUSPLUS >= 201703L
-#  define FMT_CONSTEXPR_CHAR_TRAITS constexpr
-#endif
-#ifndef FMT_CONSTEXPR_CHAR_TRAITS
-#  define FMT_CONSTEXPR_CHAR_TRAITS
-#endif
-
-// Check if exceptions are disabled.
-#ifndef FMT_EXCEPTIONS
-#  if (defined(__GNUC__) && !defined(__EXCEPTIONS)) || \
-      (FMT_MSC_VERSION && !_HAS_EXCEPTIONS)
-#    define FMT_EXCEPTIONS 0
-#  else
-#    define FMT_EXCEPTIONS 1
-#  endif
-#endif
-
-#ifndef FMT_DEPRECATED
-#  if FMT_HAS_CPP14_ATTRIBUTE(deprecated) || FMT_MSC_VERSION >= 1900
-#    define FMT_DEPRECATED [[deprecated]]
-#  else
-#    if (defined(__GNUC__) && !defined(__LCC__)) || defined(__clang__)
-#      define FMT_DEPRECATED __attribute__((deprecated))
-#    elif FMT_MSC_VERSION
-#      define FMT_DEPRECATED __declspec(deprecated)
-#    else
-#      define FMT_DEPRECATED /* deprecated */
-#    endif
-#  endif
-#endif
-
-// [[noreturn]] is disabled on MSVC and NVCC because of bogus unreachable code
-// warnings.
-#if FMT_EXCEPTIONS && FMT_HAS_CPP_ATTRIBUTE(noreturn) && !FMT_MSC_VERSION && \
-    !defined(__NVCC__)
-#  define FMT_NORETURN [[noreturn]]
-#else
-#  define FMT_NORETURN
-#endif
-
-#if FMT_HAS_CPP17_ATTRIBUTE(fallthrough)
-#  define FMT_FALLTHROUGH [[fallthrough]]
-#elif defined(__clang__)
-#  define FMT_FALLTHROUGH [[clang::fallthrough]]
-#elif FMT_GCC_VERSION >= 700 && \
-    (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 520)
-#  define FMT_FALLTHROUGH [[gnu::fallthrough]]
-#else
-#  define FMT_FALLTHROUGH
-#endif
-
-#ifndef FMT_NODISCARD
-#  if FMT_HAS_CPP17_ATTRIBUTE(nodiscard)
-#    define FMT_NODISCARD [[nodiscard]]
-#  else
-#    define FMT_NODISCARD
-#  endif
-#endif
-
-#ifndef FMT_USE_FLOAT
-#  define FMT_USE_FLOAT 1
-#endif
-#ifndef FMT_USE_DOUBLE
-#  define FMT_USE_DOUBLE 1
-#endif
-#ifndef FMT_USE_LONG_DOUBLE
-#  define FMT_USE_LONG_DOUBLE 1
-#endif
-
-#ifndef FMT_INLINE
-#  if FMT_GCC_VERSION || FMT_CLANG_VERSION
-#    define FMT_INLINE inline __attribute__((always_inline))
-#  else
-#    define FMT_INLINE inline
-#  endif
-#endif
-
-// An inline std::forward replacement.
-#define FMT_FORWARD(...) static_cast<decltype(__VA_ARGS__)&&>(__VA_ARGS__)
-
-#ifdef _MSC_VER
-#  define FMT_UNCHECKED_ITERATOR(It) \
-    using _Unchecked_type = It  // Mark iterator as checked.
-#else
-#  define FMT_UNCHECKED_ITERATOR(It) using unchecked_type = It
-#endif
-
-#ifndef FMT_BEGIN_NAMESPACE
-#  define FMT_BEGIN_NAMESPACE \
-    namespace fmt {           \
-    inline namespace v9 {
-#  define FMT_END_NAMESPACE \
-    }                       \
-    }
-#endif
-
-#ifndef FMT_MODULE_EXPORT
-#  define FMT_MODULE_EXPORT
-#  define FMT_MODULE_EXPORT_BEGIN
-#  define FMT_MODULE_EXPORT_END
-#  define FMT_BEGIN_DETAIL_NAMESPACE namespace detail {
-#  define FMT_END_DETAIL_NAMESPACE }
-#endif
-
-#if !defined(FMT_HEADER_ONLY) && defined(_WIN32)
-#  define FMT_CLASS_API FMT_MSC_WARNING(suppress : 4275)
-#  ifdef FMT_EXPORT
-#    define FMT_API __declspec(dllexport)
-#  elif defined(FMT_SHARED)
-#    define FMT_API __declspec(dllimport)
-#  endif
-#else
-#  define FMT_CLASS_API
-#  if defined(FMT_EXPORT) || defined(FMT_SHARED)
-#    if defined(__GNUC__) || defined(__clang__)
-#      define FMT_API __attribute__((visibility("default")))
-#    endif
-#  endif
-#endif
-#ifndef FMT_API
-#  define FMT_API
-#endif
-
-// libc++ supports string_view in pre-c++17.
-#if FMT_HAS_INCLUDE(<string_view>) && \
-    (FMT_CPLUSPLUS >= 201703L || defined(_LIBCPP_VERSION))
-#  include <string_view>
-#  define FMT_USE_STRING_VIEW
-#elif FMT_HAS_INCLUDE("experimental/string_view") && FMT_CPLUSPLUS >= 201402L
-#  include <experimental/string_view>
-#  define FMT_USE_EXPERIMENTAL_STRING_VIEW
-#endif
-
-#ifndef FMT_UNICODE
-#  define FMT_UNICODE !FMT_MSC_VERSION
-#endif
-
-#ifndef FMT_CONSTEVAL
-#  if ((FMT_GCC_VERSION >= 1000 || FMT_CLANG_VERSION >= 1101) &&         \
-       FMT_CPLUSPLUS >= 202002L && !defined(__apple_build_version__)) || \
-      (defined(__cpp_consteval) &&                                       \
-       (!FMT_MSC_VERSION || _MSC_FULL_VER >= 193030704))
-// consteval is broken in MSVC before VS2022 and Apple clang 13.
-#    define FMT_CONSTEVAL consteval
-#    define FMT_HAS_CONSTEVAL
-#  else
-#    define FMT_CONSTEVAL
-#  endif
-#endif
-
-#ifndef FMT_USE_NONTYPE_TEMPLATE_ARGS
-#  if defined(__cpp_nontype_template_args) &&                  \
-      ((FMT_GCC_VERSION >= 903 && FMT_CPLUSPLUS >= 201709L) || \
-       __cpp_nontype_template_args >= 201911L) &&              \
-      !defined(__NVCOMPILER) && !defined(__LCC__)
-#    define FMT_USE_NONTYPE_TEMPLATE_ARGS 1
-#  else
-#    define FMT_USE_NONTYPE_TEMPLATE_ARGS 0
-#  endif
-#endif
-
-// Enable minimal optimizations for more compact code in debug mode.
-FMT_GCC_PRAGMA("GCC push_options")
-#if !defined(__OPTIMIZE__) && !defined(__NVCOMPILER) && !defined(__LCC__)
-FMT_GCC_PRAGMA("GCC optimize(\"Og\")")
-#endif
-
-FMT_BEGIN_NAMESPACE
-FMT_MODULE_EXPORT_BEGIN
-
-// Implementations of enable_if_t and other metafunctions for older systems.
-template <bool B, typename T = void>
-using enable_if_t = typename std::enable_if<B, T>::type;
-template <bool B, typename T, typename F>
-using conditional_t = typename std::conditional<B, T, F>::type;
-template <bool B> using bool_constant = std::integral_constant<bool, B>;
-template <typename T>
-using remove_reference_t = typename std::remove_reference<T>::type;
-template <typename T>
-using remove_const_t = typename std::remove_const<T>::type;
-template <typename T>
-using remove_cvref_t = typename std::remove_cv<remove_reference_t<T>>::type;
-template <typename T> struct type_identity { using type = T; };
-template <typename T> using type_identity_t = typename type_identity<T>::type;
-template <typename T>
-using underlying_t = typename std::underlying_type<T>::type;
-
-template <typename...> struct disjunction : std::false_type {};
-template <typename P> struct disjunction<P> : P {};
-template <typename P1, typename... Pn>
-struct disjunction<P1, Pn...>
-    : conditional_t<bool(P1::value), P1, disjunction<Pn...>> {};
-
-template <typename...> struct conjunction : std::true_type {};
-template <typename P> struct conjunction<P> : P {};
-template <typename P1, typename... Pn>
-struct conjunction<P1, Pn...>
-    : conditional_t<bool(P1::value), conjunction<Pn...>, P1> {};
-
-struct monostate {
-  constexpr monostate() {}
-};
-
-// An enable_if helper to be used in template parameters which results in much
-// shorter symbols: https://godbolt.org/z/sWw4vP. Extra parentheses are needed
-// to workaround a bug in MSVC 2019 (see #1140 and #1186).
-#ifdef FMT_DOC
-#  define FMT_ENABLE_IF(...)
-#else
-#  define FMT_ENABLE_IF(...) fmt::enable_if_t<(__VA_ARGS__), int> = 0
-#endif
-
-FMT_BEGIN_DETAIL_NAMESPACE
-
-// Suppresses "unused variable" warnings with the method described in
-// https://herbsutter.com/2009/10/18/mailbag-shutting-up-compiler-warnings/.
-// (void)var does not work on many Intel compilers.
-template <typename... T> FMT_CONSTEXPR void ignore_unused(const T&...) {}
-
-constexpr FMT_INLINE auto is_constant_evaluated(
-    bool default_value = false) noexcept -> bool {
-#ifdef __cpp_lib_is_constant_evaluated
-  ignore_unused(default_value);
-  return std::is_constant_evaluated();
-#else
-  return default_value;
-#endif
-}
-
-// Suppresses "conditional expression is constant" warnings.
-template <typename T> constexpr FMT_INLINE auto const_check(T value) -> T {
-  return value;
-}
-
-FMT_NORETURN FMT_API void assert_fail(const char* file, int line,
-                                      const char* message);
-
-#ifndef FMT_ASSERT
-#  ifdef NDEBUG
-// FMT_ASSERT is not empty to avoid -Wempty-body.
-#    define FMT_ASSERT(condition, message) \
-      ::fmt::detail::ignore_unused((condition), (message))
-#  else
-#    define FMT_ASSERT(condition, message)                                    \
-      ((condition) /* void() fails with -Winvalid-constexpr on clang 4.0.1 */ \
-           ? (void)0                                                          \
-           : ::fmt::detail::assert_fail(__FILE__, __LINE__, (message)))
-#  endif
-#endif
-
-#if defined(FMT_USE_STRING_VIEW)
-template <typename Char> using std_string_view = std::basic_string_view<Char>;
-#elif defined(FMT_USE_EXPERIMENTAL_STRING_VIEW)
-template <typename Char>
-using std_string_view = std::experimental::basic_string_view<Char>;
-#else
-template <typename T> struct std_string_view {};
-#endif
-
-#ifdef FMT_USE_INT128
-// Do nothing.
-#elif defined(__SIZEOF_INT128__) && !defined(__NVCC__) && \
-    !(FMT_CLANG_VERSION && FMT_MSC_VERSION)
-#  define FMT_USE_INT128 1
-using int128_opt = __int128_t;  // An optional native 128-bit integer.
-using uint128_opt = __uint128_t;
-template <typename T> inline auto convert_for_visit(T value) -> T {
-  return value;
-}
-#else
-#  define FMT_USE_INT128 0
-#endif
-#if !FMT_USE_INT128
-enum class int128_opt {};
-enum class uint128_opt {};
-// Reduce template instantiations.
-template <typename T> auto convert_for_visit(T) -> monostate { return {}; }
-#endif
-
-// Casts a nonnegative integer to unsigned.
-template <typename Int>
-FMT_CONSTEXPR auto to_unsigned(Int value) ->
-    typename std::make_unsigned<Int>::type {
-  return static_cast<typename std::make_unsigned<Int>::type>(value);
-}
-
-FMT_MSC_WARNING(suppress : 4566) constexpr unsigned char micro[] = "\u00B5";
-
-constexpr auto is_utf8() -> bool {
-  // Avoid buggy sign extensions in MSVC's constant evaluation mode (#2297).
-  using uchar = unsigned char;
-  return FMT_UNICODE || (sizeof(micro) == 3 && uchar(micro[0]) == 0xC2 &&
-                         uchar(micro[1]) == 0xB5);
-}
-FMT_END_DETAIL_NAMESPACE
-
-/**
-  An implementation of ``std::basic_string_view`` for pre-C++17. It provides a
-  subset of the API. ``fmt::basic_string_view`` is used for format strings even
-  if ``std::string_view`` is available to prevent issues when a library is
-  compiled with a different ``-std`` option than the client code (which is not
-  recommended).
- */
-template <typename Char> class basic_string_view {
- private:
-  const Char* data_;
-  size_t size_;
-
- public:
-  using value_type = Char;
-  using iterator = const Char*;
-
-  constexpr basic_string_view() noexcept : data_(nullptr), size_(0) {}
-
-  /** Constructs a string reference object from a C string and a size. */
-  constexpr basic_string_view(const Char* s, size_t count) noexcept
-      : data_(s), size_(count) {}
-
-  /**
-    \rst
-    Constructs a string reference object from a C string computing
-    the size with ``std::char_traits<Char>::length``.
-    \endrst
-   */
-  FMT_CONSTEXPR_CHAR_TRAITS
-  FMT_INLINE
-  basic_string_view(const Char* s)
-      : data_(s),
-        size_(detail::const_check(std::is_same<Char, char>::value &&
-                                  !detail::is_constant_evaluated(true))
-                  ? std::strlen(reinterpret_cast<const char*>(s))
-                  : std::char_traits<Char>::length(s)) {}
-
-  /** Constructs a string reference from a ``std::basic_string`` object. */
-  template <typename Traits, typename Alloc>
-  FMT_CONSTEXPR basic_string_view(
-      const std::basic_string<Char, Traits, Alloc>& s) noexcept
-      : data_(s.data()), size_(s.size()) {}
-
-  template <typename S, FMT_ENABLE_IF(std::is_same<
-                                      S, detail::std_string_view<Char>>::value)>
-  FMT_CONSTEXPR basic_string_view(S s) noexcept
-      : data_(s.data()), size_(s.size()) {}
-
-  /** Returns a pointer to the string data. */
-  constexpr auto data() const noexcept -> const Char* { return data_; }
-
-  /** Returns the string size. */
-  constexpr auto size() const noexcept -> size_t { return size_; }
-
-  constexpr auto begin() const noexcept -> iterator { return data_; }
-  constexpr auto end() const noexcept -> iterator { return data_ + size_; }
-
-  constexpr auto operator[](size_t pos) const noexcept -> const Char& {
-    return data_[pos];
-  }
-
-  FMT_CONSTEXPR void remove_prefix(size_t n) noexcept {
-    data_ += n;
-    size_ -= n;
-  }
-
-  FMT_CONSTEXPR_CHAR_TRAITS bool starts_with(
-      basic_string_view<Char> sv) const noexcept {
-    return size_ >= sv.size_ &&
-           std::char_traits<Char>::compare(data_, sv.data_, sv.size_) == 0;
-  }
-  FMT_CONSTEXPR_CHAR_TRAITS bool starts_with(Char c) const noexcept {
-    return size_ >= 1 && std::char_traits<Char>::eq(*data_, c);
-  }
-  FMT_CONSTEXPR_CHAR_TRAITS bool starts_with(const Char* s) const {
-    return starts_with(basic_string_view<Char>(s));
-  }
-
-  // Lexicographically compare this string reference to other.
-  FMT_CONSTEXPR_CHAR_TRAITS auto compare(basic_string_view other) const -> int {
-    size_t str_size = size_ < other.size_ ? size_ : other.size_;
-    int result = std::char_traits<Char>::compare(data_, other.data_, str_size);
-    if (result == 0)
-      result = size_ == other.size_ ? 0 : (size_ < other.size_ ? -1 : 1);
-    return result;
-  }
-
-  FMT_CONSTEXPR_CHAR_TRAITS friend auto operator==(basic_string_view lhs,
-                                                   basic_string_view rhs)
-      -> bool {
-    return lhs.compare(rhs) == 0;
-  }
-  friend auto operator!=(basic_string_view lhs, basic_string_view rhs) -> bool {
-    return lhs.compare(rhs) != 0;
-  }
-  friend auto operator<(basic_string_view lhs, basic_string_view rhs) -> bool {
-    return lhs.compare(rhs) < 0;
-  }
-  friend auto operator<=(basic_string_view lhs, basic_string_view rhs) -> bool {
-    return lhs.compare(rhs) <= 0;
-  }
-  friend auto operator>(basic_string_view lhs, basic_string_view rhs) -> bool {
-    return lhs.compare(rhs) > 0;
-  }
-  friend auto operator>=(basic_string_view lhs, basic_string_view rhs) -> bool {
-    return lhs.compare(rhs) >= 0;
-  }
-};
-
-using string_view = basic_string_view<char>;
-
-/** Specifies if ``T`` is a character type. Can be specialized by users. */
-template <typename T> struct is_char : std::false_type {};
-template <> struct is_char<char> : std::true_type {};
-
-FMT_BEGIN_DETAIL_NAMESPACE
-
-// A base class for compile-time strings.
-struct compile_string {};
-
-template <typename S>
-struct is_compile_string : std::is_base_of<compile_string, S> {};
-
-// Returns a string view of `s`.
-template <typename Char, FMT_ENABLE_IF(is_char<Char>::value)>
-FMT_INLINE auto to_string_view(const Char* s) -> basic_string_view<Char> {
-  return s;
-}
-template <typename Char, typename Traits, typename Alloc>
-inline auto to_string_view(const std::basic_string<Char, Traits, Alloc>& s)
-    -> basic_string_view<Char> {
-  return s;
-}
-template <typename Char>
-constexpr auto to_string_view(basic_string_view<Char> s)
-    -> basic_string_view<Char> {
-  return s;
-}
-template <typename Char,
-          FMT_ENABLE_IF(!std::is_empty<std_string_view<Char>>::value)>
-inline auto to_string_view(std_string_view<Char> s) -> basic_string_view<Char> {
-  return s;
-}
-template <typename S, FMT_ENABLE_IF(is_compile_string<S>::value)>
-constexpr auto to_string_view(const S& s)
-    -> basic_string_view<typename S::char_type> {
-  return basic_string_view<typename S::char_type>(s);
-}
-void to_string_view(...);
-
-// Specifies whether S is a string type convertible to fmt::basic_string_view.
-// It should be a constexpr function but MSVC 2017 fails to compile it in
-// enable_if and MSVC 2015 fails to compile it as an alias template.
-// ADL invocation of to_string_view is DEPRECATED!
-template <typename S>
-struct is_string : std::is_class<decltype(to_string_view(std::declval<S>()))> {
-};
-
-template <typename S, typename = void> struct char_t_impl {};
-template <typename S> struct char_t_impl<S, enable_if_t<is_string<S>::value>> {
-  using result = decltype(to_string_view(std::declval<S>()));
-  using type = typename result::value_type;
-};
-
-enum class type {
-  none_type,
-  // Integer types should go first,
-  int_type,
-  uint_type,
-  long_long_type,
-  ulong_long_type,
-  int128_type,
-  uint128_type,
-  bool_type,
-  char_type,
-  last_integer_type = char_type,
-  // followed by floating-point types.
-  float_type,
-  double_type,
-  long_double_type,
-  last_numeric_type = long_double_type,
-  cstring_type,
-  string_type,
-  pointer_type,
-  custom_type
-};
-
-// Maps core type T to the corresponding type enum constant.
-template <typename T, typename Char>
-struct type_constant : std::integral_constant<type, type::custom_type> {};
-
-#define FMT_TYPE_CONSTANT(Type, constant) \
-  template <typename Char>                \
-  struct type_constant<Type, Char>        \
-      : std::integral_constant<type, type::constant> {}
-
-FMT_TYPE_CONSTANT(int, int_type);
-FMT_TYPE_CONSTANT(unsigned, uint_type);
-FMT_TYPE_CONSTANT(long long, long_long_type);
-FMT_TYPE_CONSTANT(unsigned long long, ulong_long_type);
-FMT_TYPE_CONSTANT(int128_opt, int128_type);
-FMT_TYPE_CONSTANT(uint128_opt, uint128_type);
-FMT_TYPE_CONSTANT(bool, bool_type);
-FMT_TYPE_CONSTANT(Char, char_type);
-FMT_TYPE_CONSTANT(float, float_type);
-FMT_TYPE_CONSTANT(double, double_type);
-FMT_TYPE_CONSTANT(long double, long_double_type);
-FMT_TYPE_CONSTANT(const Char*, cstring_type);
-FMT_TYPE_CONSTANT(basic_string_view<Char>, string_type);
-FMT_TYPE_CONSTANT(const void*, pointer_type);
-
-constexpr bool is_integral_type(type t) {
-  return t > type::none_type && t <= type::last_integer_type;
-}
-
-constexpr bool is_arithmetic_type(type t) {
-  return t > type::none_type && t <= type::last_numeric_type;
-}
-
-FMT_NORETURN FMT_API void throw_format_error(const char* message);
-
-struct error_handler {
-  constexpr error_handler() = default;
-  constexpr error_handler(const error_handler&) = default;
-
-  // This function is intentionally not constexpr to give a compile-time error.
-  FMT_NORETURN void on_error(const char* message) {
-    throw_format_error(message);
-  }
-};
-FMT_END_DETAIL_NAMESPACE
-
-/** String's character type. */
-template <typename S> using char_t = typename detail::char_t_impl<S>::type;
-
-/**
-  \rst
-  Parsing context consisting of a format string range being parsed and an
-  argument counter for automatic indexing.
-  You can use the ``format_parse_context`` type alias for ``char`` instead.
-  \endrst
- */
-template <typename Char, typename ErrorHandler = detail::error_handler>
-class basic_format_parse_context : private ErrorHandler {
- private:
-  basic_string_view<Char> format_str_;
-  int next_arg_id_;
-
-  FMT_CONSTEXPR void do_check_arg_id(int id);
-
- public:
-  using char_type = Char;
-  using iterator = typename basic_string_view<Char>::iterator;
-
-  explicit constexpr basic_format_parse_context(
-      basic_string_view<Char> format_str, ErrorHandler eh = {},
-      int next_arg_id = 0)
-      : ErrorHandler(eh), format_str_(format_str), next_arg_id_(next_arg_id) {}
-
-  /**
-    Returns an iterator to the beginning of the format string range being
-    parsed.
-   */
-  constexpr auto begin() const noexcept -> iterator {
-    return format_str_.begin();
-  }
-
-  /**
-    Returns an iterator past the end of the format string range being parsed.
-   */
-  constexpr auto end() const noexcept -> iterator { return format_str_.end(); }
-
-  /** Advances the begin iterator to ``it``. */
-  FMT_CONSTEXPR void advance_to(iterator it) {
-    format_str_.remove_prefix(detail::to_unsigned(it - begin()));
-  }
-
-  /**
-    Reports an error if using the manual argument indexing; otherwise returns
-    the next argument index and switches to the automatic indexing.
-   */
-  FMT_CONSTEXPR auto next_arg_id() -> int {
-    if (next_arg_id_ < 0) {
-      on_error("cannot switch from manual to automatic argument indexing");
-      return 0;
-    }
-    int id = next_arg_id_++;
-    do_check_arg_id(id);
-    return id;
-  }
-
-  /**
-    Reports an error if using the automatic argument indexing; otherwise
-    switches to the manual indexing.
-   */
-  FMT_CONSTEXPR void check_arg_id(int id) {
-    if (next_arg_id_ > 0) {
-      on_error("cannot switch from automatic to manual argument indexing");
-      return;
-    }
-    next_arg_id_ = -1;
-    do_check_arg_id(id);
-  }
-  FMT_CONSTEXPR void check_arg_id(basic_string_view<Char>) {}
-  FMT_CONSTEXPR void check_dynamic_spec(int arg_id);
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    ErrorHandler::on_error(message);
-  }
-
-  constexpr auto error_handler() const -> ErrorHandler { return *this; }
-};
-
-using format_parse_context = basic_format_parse_context<char>;
-
-FMT_BEGIN_DETAIL_NAMESPACE
-// A parse context with extra data used only in compile-time checks.
-template <typename Char, typename ErrorHandler = detail::error_handler>
-class compile_parse_context
-    : public basic_format_parse_context<Char, ErrorHandler> {
- private:
-  int num_args_;
-  const type* types_;
-  using base = basic_format_parse_context<Char, ErrorHandler>;
-
- public:
-  explicit FMT_CONSTEXPR compile_parse_context(
-      basic_string_view<Char> format_str, int num_args, const type* types,
-      ErrorHandler eh = {}, int next_arg_id = 0)
-      : base(format_str, eh, next_arg_id), num_args_(num_args), types_(types) {}
-
-  constexpr auto num_args() const -> int { return num_args_; }
-  constexpr auto arg_type(int id) const -> type { return types_[id]; }
-
-  FMT_CONSTEXPR auto next_arg_id() -> int {
-    int id = base::next_arg_id();
-    if (id >= num_args_) this->on_error("argument not found");
-    return id;
-  }
-
-  FMT_CONSTEXPR void check_arg_id(int id) {
-    base::check_arg_id(id);
-    if (id >= num_args_) this->on_error("argument not found");
-  }
-  using base::check_arg_id;
-
-  FMT_CONSTEXPR void check_dynamic_spec(int arg_id) {
-    detail::ignore_unused(arg_id);
-#if !defined(__LCC__)
-    if (arg_id < num_args_ && types_ && !is_integral_type(types_[arg_id]))
-      this->on_error("width/precision is not integer");
-#endif
-  }
-};
-FMT_END_DETAIL_NAMESPACE
-
-template <typename Char, typename ErrorHandler>
-FMT_CONSTEXPR void
-basic_format_parse_context<Char, ErrorHandler>::do_check_arg_id(int id) {
-  // Argument id is only checked at compile-time during parsing because
-  // formatting has its own validation.
-  if (detail::is_constant_evaluated() && FMT_GCC_VERSION >= 1200) {
-    using context = detail::compile_parse_context<Char, ErrorHandler>;
-    if (id >= static_cast<context*>(this)->num_args())
-      on_error("argument not found");
-  }
-}
-
-template <typename Char, typename ErrorHandler>
-FMT_CONSTEXPR void
-basic_format_parse_context<Char, ErrorHandler>::check_dynamic_spec(int arg_id) {
-  if (detail::is_constant_evaluated()) {
-    using context = detail::compile_parse_context<Char, ErrorHandler>;
-    static_cast<context*>(this)->check_dynamic_spec(arg_id);
-  }
-}
-
-template <typename Context> class basic_format_arg;
-template <typename Context> class basic_format_args;
-template <typename Context> class dynamic_format_arg_store;
-
-// A formatter for objects of type T.
-template <typename T, typename Char = char, typename Enable = void>
-struct formatter {
-  // A deleted default constructor indicates a disabled formatter.
-  formatter() = delete;
-};
-
-// Specifies if T has an enabled formatter specialization. A type can be
-// formattable even if it doesn't have a formatter e.g. via a conversion.
-template <typename T, typename Context>
-using has_formatter =
-    std::is_constructible<typename Context::template formatter_type<T>>;
-
-// Checks whether T is a container with contiguous storage.
-template <typename T> struct is_contiguous : std::false_type {};
-template <typename Char>
-struct is_contiguous<std::basic_string<Char>> : std::true_type {};
-
-class appender;
-
-FMT_BEGIN_DETAIL_NAMESPACE
-
-template <typename Context, typename T>
-constexpr auto has_const_formatter_impl(T*)
-    -> decltype(typename Context::template formatter_type<T>().format(
-                    std::declval<const T&>(), std::declval<Context&>()),
-                true) {
-  return true;
-}
-template <typename Context>
-constexpr auto has_const_formatter_impl(...) -> bool {
-  return false;
-}
-template <typename T, typename Context>
-constexpr auto has_const_formatter() -> bool {
-  return has_const_formatter_impl<Context>(static_cast<T*>(nullptr));
-}
-
-// Extracts a reference to the container from back_insert_iterator.
-template <typename Container>
-inline auto get_container(std::back_insert_iterator<Container> it)
-    -> Container& {
-  using base = std::back_insert_iterator<Container>;
-  struct accessor : base {
-    accessor(base b) : base(b) {}
-    using base::container;
-  };
-  return *accessor(it).container;
-}
-
-template <typename Char, typename InputIt, typename OutputIt>
-FMT_CONSTEXPR auto copy_str(InputIt begin, InputIt end, OutputIt out)
-    -> OutputIt {
-  while (begin != end) *out++ = static_cast<Char>(*begin++);
-  return out;
-}
-
-template <typename Char, typename T, typename U,
-          FMT_ENABLE_IF(
-              std::is_same<remove_const_t<T>, U>::value&& is_char<U>::value)>
-FMT_CONSTEXPR auto copy_str(T* begin, T* end, U* out) -> U* {
-  if (is_constant_evaluated()) return copy_str<Char, T*, U*>(begin, end, out);
-  auto size = to_unsigned(end - begin);
-  memcpy(out, begin, size * sizeof(U));
-  return out + size;
-}
-
-/**
-  \rst
-  A contiguous memory buffer with an optional growing ability. It is an internal
-  class and shouldn't be used directly, only via `~fmt::basic_memory_buffer`.
-  \endrst
- */
-template <typename T> class buffer {
- private:
-  T* ptr_;
-  size_t size_;
-  size_t capacity_;
-
- protected:
-  // Don't initialize ptr_ since it is not accessed to save a few cycles.
-  FMT_MSC_WARNING(suppress : 26495)
-  buffer(size_t sz) noexcept : size_(sz), capacity_(sz) {}
-
-  FMT_CONSTEXPR20 buffer(T* p = nullptr, size_t sz = 0, size_t cap = 0) noexcept
-      : ptr_(p), size_(sz), capacity_(cap) {}
-
-  FMT_CONSTEXPR20 ~buffer() = default;
-  buffer(buffer&&) = default;
-
-  /** Sets the buffer data and capacity. */
-  FMT_CONSTEXPR void set(T* buf_data, size_t buf_capacity) noexcept {
-    ptr_ = buf_data;
-    capacity_ = buf_capacity;
-  }
-
-  /** Increases the buffer capacity to hold at least *capacity* elements. */
-  virtual FMT_CONSTEXPR20 void grow(size_t capacity) = 0;
-
- public:
-  using value_type = T;
-  using const_reference = const T&;
-
-  buffer(const buffer&) = delete;
-  void operator=(const buffer&) = delete;
-
-  FMT_INLINE auto begin() noexcept -> T* { return ptr_; }
-  FMT_INLINE auto end() noexcept -> T* { return ptr_ + size_; }
-
-  FMT_INLINE auto begin() const noexcept -> const T* { return ptr_; }
-  FMT_INLINE auto end() const noexcept -> const T* { return ptr_ + size_; }
-
-  /** Returns the size of this buffer. */
-  constexpr auto size() const noexcept -> size_t { return size_; }
-
-  /** Returns the capacity of this buffer. */
-  constexpr auto capacity() const noexcept -> size_t { return capacity_; }
-
-  /** Returns a pointer to the buffer data. */
-  FMT_CONSTEXPR auto data() noexcept -> T* { return ptr_; }
-
-  /** Returns a pointer to the buffer data. */
-  FMT_CONSTEXPR auto data() const noexcept -> const T* { return ptr_; }
-
-  /** Clears this buffer. */
-  void clear() { size_ = 0; }
-
-  // Tries resizing the buffer to contain *count* elements. If T is a POD type
-  // the new elements may not be initialized.
-  FMT_CONSTEXPR20 void try_resize(size_t count) {
-    try_reserve(count);
-    size_ = count <= capacity_ ? count : capacity_;
-  }
-
-  // Tries increasing the buffer capacity to *new_capacity*. It can increase the
-  // capacity by a smaller amount than requested but guarantees there is space
-  // for at least one additional element either by increasing the capacity or by
-  // flushing the buffer if it is full.
-  FMT_CONSTEXPR20 void try_reserve(size_t new_capacity) {
-    if (new_capacity > capacity_) grow(new_capacity);
-  }
-
-  FMT_CONSTEXPR20 void push_back(const T& value) {
-    try_reserve(size_ + 1);
-    ptr_[size_++] = value;
-  }
-
-  /** Appends data to the end of the buffer. */
-  template <typename U> void append(const U* begin, const U* end);
-
-  template <typename Idx> FMT_CONSTEXPR auto operator[](Idx index) -> T& {
-    return ptr_[index];
-  }
-  template <typename Idx>
-  FMT_CONSTEXPR auto operator[](Idx index) const -> const T& {
-    return ptr_[index];
-  }
-};
-
-struct buffer_traits {
-  explicit buffer_traits(size_t) {}
-  auto count() const -> size_t { return 0; }
-  auto limit(size_t size) -> size_t { return size; }
-};
-
-class fixed_buffer_traits {
- private:
-  size_t count_ = 0;
-  size_t limit_;
-
- public:
-  explicit fixed_buffer_traits(size_t limit) : limit_(limit) {}
-  auto count() const -> size_t { return count_; }
-  auto limit(size_t size) -> size_t {
-    size_t n = limit_ > count_ ? limit_ - count_ : 0;
-    count_ += size;
-    return size < n ? size : n;
-  }
-};
-
-// A buffer that writes to an output iterator when flushed.
-template <typename OutputIt, typename T, typename Traits = buffer_traits>
-class iterator_buffer final : public Traits, public buffer<T> {
- private:
-  OutputIt out_;
-  enum { buffer_size = 256 };
-  T data_[buffer_size];
-
- protected:
-  FMT_CONSTEXPR20 void grow(size_t) override {
-    if (this->size() == buffer_size) flush();
-  }
-
-  void flush() {
-    auto size = this->size();
-    this->clear();
-    out_ = copy_str<T>(data_, data_ + this->limit(size), out_);
-  }
-
- public:
-  explicit iterator_buffer(OutputIt out, size_t n = buffer_size)
-      : Traits(n), buffer<T>(data_, 0, buffer_size), out_(out) {}
-  iterator_buffer(iterator_buffer&& other)
-      : Traits(other), buffer<T>(data_, 0, buffer_size), out_(other.out_) {}
-  ~iterator_buffer() { flush(); }
-
-  auto out() -> OutputIt {
-    flush();
-    return out_;
-  }
-  auto count() const -> size_t { return Traits::count() + this->size(); }
-};
-
-template <typename T>
-class iterator_buffer<T*, T, fixed_buffer_traits> final
-    : public fixed_buffer_traits,
-      public buffer<T> {
- private:
-  T* out_;
-  enum { buffer_size = 256 };
-  T data_[buffer_size];
-
- protected:
-  FMT_CONSTEXPR20 void grow(size_t) override {
-    if (this->size() == this->capacity()) flush();
-  }
-
-  void flush() {
-    size_t n = this->limit(this->size());
-    if (this->data() == out_) {
-      out_ += n;
-      this->set(data_, buffer_size);
-    }
-    this->clear();
-  }
-
- public:
-  explicit iterator_buffer(T* out, size_t n = buffer_size)
-      : fixed_buffer_traits(n), buffer<T>(out, 0, n), out_(out) {}
-  iterator_buffer(iterator_buffer&& other)
-      : fixed_buffer_traits(other),
-        buffer<T>(std::move(other)),
-        out_(other.out_) {
-    if (this->data() != out_) {
-      this->set(data_, buffer_size);
-      this->clear();
-    }
-  }
-  ~iterator_buffer() { flush(); }
-
-  auto out() -> T* {
-    flush();
-    return out_;
-  }
-  auto count() const -> size_t {
-    return fixed_buffer_traits::count() + this->size();
-  }
-};
-
-template <typename T> class iterator_buffer<T*, T> final : public buffer<T> {
- protected:
-  FMT_CONSTEXPR20 void grow(size_t) override {}
-
- public:
-  explicit iterator_buffer(T* out, size_t = 0) : buffer<T>(out, 0, ~size_t()) {}
-
-  auto out() -> T* { return &*this->end(); }
-};
-
-// A buffer that writes to a container with the contiguous storage.
-template <typename Container>
-class iterator_buffer<std::back_insert_iterator<Container>,
-                      enable_if_t<is_contiguous<Container>::value,
-                                  typename Container::value_type>>
-    final : public buffer<typename Container::value_type> {
- private:
-  Container& container_;
-
- protected:
-  FMT_CONSTEXPR20 void grow(size_t capacity) override {
-    container_.resize(capacity);
-    this->set(&container_[0], capacity);
-  }
-
- public:
-  explicit iterator_buffer(Container& c)
-      : buffer<typename Container::value_type>(c.size()), container_(c) {}
-  explicit iterator_buffer(std::back_insert_iterator<Container> out, size_t = 0)
-      : iterator_buffer(get_container(out)) {}
-
-  auto out() -> std::back_insert_iterator<Container> {
-    return std::back_inserter(container_);
-  }
-};
-
-// A buffer that counts the number of code units written discarding the output.
-template <typename T = char> class counting_buffer final : public buffer<T> {
- private:
-  enum { buffer_size = 256 };
-  T data_[buffer_size];
-  size_t count_ = 0;
-
- protected:
-  FMT_CONSTEXPR20 void grow(size_t) override {
-    if (this->size() != buffer_size) return;
-    count_ += this->size();
-    this->clear();
-  }
-
- public:
-  counting_buffer() : buffer<T>(data_, 0, buffer_size) {}
-
-  auto count() -> size_t { return count_ + this->size(); }
-};
-
-template <typename T>
-using buffer_appender = conditional_t<std::is_same<T, char>::value, appender,
-                                      std::back_insert_iterator<buffer<T>>>;
-
-// Maps an output iterator to a buffer.
-template <typename T, typename OutputIt>
-auto get_buffer(OutputIt out) -> iterator_buffer<OutputIt, T> {
-  return iterator_buffer<OutputIt, T>(out);
-}
-template <typename T, typename Buf,
-          FMT_ENABLE_IF(std::is_base_of<buffer<char>, Buf>::value)>
-auto get_buffer(std::back_insert_iterator<Buf> out) -> buffer<char>& {
-  return get_container(out);
-}
-
-template <typename Buf, typename OutputIt>
-FMT_INLINE auto get_iterator(Buf& buf, OutputIt) -> decltype(buf.out()) {
-  return buf.out();
-}
-template <typename T, typename OutputIt>
-auto get_iterator(buffer<T>&, OutputIt out) -> OutputIt {
-  return out;
-}
-
-template <typename T, typename Char = char, typename Enable = void>
-struct fallback_formatter {
-  fallback_formatter() = delete;
-};
-
-// Specifies if T has an enabled fallback_formatter specialization.
-template <typename T, typename Char>
-using has_fallback_formatter =
-#ifdef FMT_DEPRECATED_OSTREAM
-    std::is_constructible<fallback_formatter<T, Char>>;
-#else
-    std::false_type;
-#endif
-
-struct view {};
-
-template <typename Char, typename T> struct named_arg : view {
-  const Char* name;
-  const T& value;
-  named_arg(const Char* n, const T& v) : name(n), value(v) {}
-};
-
-template <typename Char> struct named_arg_info {
-  const Char* name;
-  int id;
-};
-
-template <typename T, typename Char, size_t NUM_ARGS, size_t NUM_NAMED_ARGS>
-struct arg_data {
-  // args_[0].named_args points to named_args_ to avoid bloating format_args.
-  // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning.
-  T args_[1 + (NUM_ARGS != 0 ? NUM_ARGS : +1)];
-  named_arg_info<Char> named_args_[NUM_NAMED_ARGS];
-
-  template <typename... U>
-  arg_data(const U&... init) : args_{T(named_args_, NUM_NAMED_ARGS), init...} {}
-  arg_data(const arg_data& other) = delete;
-  auto args() const -> const T* { return args_ + 1; }
-  auto named_args() -> named_arg_info<Char>* { return named_args_; }
-};
-
-template <typename T, typename Char, size_t NUM_ARGS>
-struct arg_data<T, Char, NUM_ARGS, 0> {
-  // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning.
-  T args_[NUM_ARGS != 0 ? NUM_ARGS : +1];
-
-  template <typename... U>
-  FMT_CONSTEXPR FMT_INLINE arg_data(const U&... init) : args_{init...} {}
-  FMT_CONSTEXPR FMT_INLINE auto args() const -> const T* { return args_; }
-  FMT_CONSTEXPR FMT_INLINE auto named_args() -> std::nullptr_t {
-    return nullptr;
-  }
-};
-
-template <typename Char>
-inline void init_named_args(named_arg_info<Char>*, int, int) {}
-
-template <typename T> struct is_named_arg : std::false_type {};
-template <typename T> struct is_statically_named_arg : std::false_type {};
-
-template <typename T, typename Char>
-struct is_named_arg<named_arg<Char, T>> : std::true_type {};
-
-template <typename Char, typename T, typename... Tail,
-          FMT_ENABLE_IF(!is_named_arg<T>::value)>
-void init_named_args(named_arg_info<Char>* named_args, int arg_count,
-                     int named_arg_count, const T&, const Tail&... args) {
-  init_named_args(named_args, arg_count + 1, named_arg_count, args...);
-}
-
-template <typename Char, typename T, typename... Tail,
-          FMT_ENABLE_IF(is_named_arg<T>::value)>
-void init_named_args(named_arg_info<Char>* named_args, int arg_count,
-                     int named_arg_count, const T& arg, const Tail&... args) {
-  named_args[named_arg_count++] = {arg.name, arg_count};
-  init_named_args(named_args, arg_count + 1, named_arg_count, args...);
-}
-
-template <typename... Args>
-FMT_CONSTEXPR FMT_INLINE void init_named_args(std::nullptr_t, int, int,
-                                              const Args&...) {}
-
-template <bool B = false> constexpr auto count() -> size_t { return B ? 1 : 0; }
-template <bool B1, bool B2, bool... Tail> constexpr auto count() -> size_t {
-  return (B1 ? 1 : 0) + count<B2, Tail...>();
-}
-
-template <typename... Args> constexpr auto count_named_args() -> size_t {
-  return count<is_named_arg<Args>::value...>();
-}
-
-template <typename... Args>
-constexpr auto count_statically_named_args() -> size_t {
-  return count<is_statically_named_arg<Args>::value...>();
-}
-
-struct unformattable {};
-struct unformattable_char : unformattable {};
-struct unformattable_const : unformattable {};
-struct unformattable_pointer : unformattable {};
-
-template <typename Char> struct string_value {
-  const Char* data;
-  size_t size;
-};
-
-template <typename Char> struct named_arg_value {
-  const named_arg_info<Char>* data;
-  size_t size;
-};
-
-template <typename Context> struct custom_value {
-  using parse_context = typename Context::parse_context_type;
-  void* value;
-  void (*format)(void* arg, parse_context& parse_ctx, Context& ctx);
-};
-
-// A formatting argument value.
-template <typename Context> class value {
- public:
-  using char_type = typename Context::char_type;
-
-  union {
-    monostate no_value;
-    int int_value;
-    unsigned uint_value;
-    long long long_long_value;
-    unsigned long long ulong_long_value;
-    int128_opt int128_value;
-    uint128_opt uint128_value;
-    bool bool_value;
-    char_type char_value;
-    float float_value;
-    double double_value;
-    long double long_double_value;
-    const void* pointer;
-    string_value<char_type> string;
-    custom_value<Context> custom;
-    named_arg_value<char_type> named_args;
-  };
-
-  constexpr FMT_INLINE value() : no_value() {}
-  constexpr FMT_INLINE value(int val) : int_value(val) {}
-  constexpr FMT_INLINE value(unsigned val) : uint_value(val) {}
-  constexpr FMT_INLINE value(long long val) : long_long_value(val) {}
-  constexpr FMT_INLINE value(unsigned long long val) : ulong_long_value(val) {}
-  FMT_INLINE value(int128_opt val) : int128_value(val) {}
-  FMT_INLINE value(uint128_opt val) : uint128_value(val) {}
-  constexpr FMT_INLINE value(float val) : float_value(val) {}
-  constexpr FMT_INLINE value(double val) : double_value(val) {}
-  FMT_INLINE value(long double val) : long_double_value(val) {}
-  constexpr FMT_INLINE value(bool val) : bool_value(val) {}
-  constexpr FMT_INLINE value(char_type val) : char_value(val) {}
-  FMT_CONSTEXPR FMT_INLINE value(const char_type* val) {
-    string.data = val;
-    if (is_constant_evaluated()) string.size = {};
-  }
-  FMT_CONSTEXPR FMT_INLINE value(basic_string_view<char_type> val) {
-    string.data = val.data();
-    string.size = val.size();
-  }
-  FMT_INLINE value(const void* val) : pointer(val) {}
-  FMT_INLINE value(const named_arg_info<char_type>* args, size_t size)
-      : named_args{args, size} {}
-
-  template <typename T> FMT_CONSTEXPR FMT_INLINE value(T& val) {
-    using value_type = remove_cvref_t<T>;
-    custom.value = const_cast<value_type*>(&val);
-    // Get the formatter type through the context to allow different contexts
-    // have different extension points, e.g. `formatter<T>` for `format` and
-    // `printf_formatter<T>` for `printf`.
-    custom.format = format_custom_arg<
-        value_type,
-        conditional_t<has_formatter<value_type, Context>::value,
-                      typename Context::template formatter_type<value_type>,
-                      fallback_formatter<value_type, char_type>>>;
-  }
-  value(unformattable);
-  value(unformattable_char);
-  value(unformattable_const);
-  value(unformattable_pointer);
-
- private:
-  // Formats an argument of a custom type, such as a user-defined class.
-  template <typename T, typename Formatter>
-  static void format_custom_arg(void* arg,
-                                typename Context::parse_context_type& parse_ctx,
-                                Context& ctx) {
-    auto f = Formatter();
-    parse_ctx.advance_to(f.parse(parse_ctx));
-    using qualified_type =
-        conditional_t<has_const_formatter<T, Context>(), const T, T>;
-    ctx.advance_to(f.format(*static_cast<qualified_type*>(arg), ctx));
-  }
-};
-
-template <typename Context, typename T>
-FMT_CONSTEXPR auto make_arg(T&& value) -> basic_format_arg<Context>;
-
-// To minimize the number of types we need to deal with, long is translated
-// either to int or to long long depending on its size.
-enum { long_short = sizeof(long) == sizeof(int) };
-using long_type = conditional_t<long_short, int, long long>;
-using ulong_type = conditional_t<long_short, unsigned, unsigned long long>;
-
-#ifdef __cpp_lib_byte
-inline auto format_as(std::byte b) -> unsigned char {
-  return static_cast<unsigned char>(b);
-}
-#endif
-
-template <typename T> struct has_format_as {
-  template <typename U, typename V = decltype(format_as(U())),
-            FMT_ENABLE_IF(std::is_enum<U>::value&& std::is_integral<V>::value)>
-  static auto check(U*) -> std::true_type;
-  static auto check(...) -> std::false_type;
-
-  enum { value = decltype(check(static_cast<T*>(nullptr)))::value };
-};
-
-// Maps formatting arguments to core types.
-// arg_mapper reports errors by returning unformattable instead of using
-// static_assert because it's used in the is_formattable trait.
-template <typename Context> struct arg_mapper {
-  using char_type = typename Context::char_type;
-
-  FMT_CONSTEXPR FMT_INLINE auto map(signed char val) -> int { return val; }
-  FMT_CONSTEXPR FMT_INLINE auto map(unsigned char val) -> unsigned {
-    return val;
-  }
-  FMT_CONSTEXPR FMT_INLINE auto map(short val) -> int { return val; }
-  FMT_CONSTEXPR FMT_INLINE auto map(unsigned short val) -> unsigned {
-    return val;
-  }
-  FMT_CONSTEXPR FMT_INLINE auto map(int val) -> int { return val; }
-  FMT_CONSTEXPR FMT_INLINE auto map(unsigned val) -> unsigned { return val; }
-  FMT_CONSTEXPR FMT_INLINE auto map(long val) -> long_type { return val; }
-  FMT_CONSTEXPR FMT_INLINE auto map(unsigned long val) -> ulong_type {
-    return val;
-  }
-  FMT_CONSTEXPR FMT_INLINE auto map(long long val) -> long long { return val; }
-  FMT_CONSTEXPR FMT_INLINE auto map(unsigned long long val)
-      -> unsigned long long {
-    return val;
-  }
-  FMT_CONSTEXPR FMT_INLINE auto map(int128_opt val) -> int128_opt {
-    return val;
-  }
-  FMT_CONSTEXPR FMT_INLINE auto map(uint128_opt val) -> uint128_opt {
-    return val;
-  }
-  FMT_CONSTEXPR FMT_INLINE auto map(bool val) -> bool { return val; }
-
-  template <typename T, FMT_ENABLE_IF(std::is_same<T, char>::value ||
-                                      std::is_same<T, char_type>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto map(T val) -> char_type {
-    return val;
-  }
-  template <typename T, enable_if_t<(std::is_same<T, wchar_t>::value ||
-#ifdef __cpp_char8_t
-                                     std::is_same<T, char8_t>::value ||
-#endif
-                                     std::is_same<T, char16_t>::value ||
-                                     std::is_same<T, char32_t>::value) &&
-                                        !std::is_same<T, char_type>::value,
-                                    int> = 0>
-  FMT_CONSTEXPR FMT_INLINE auto map(T) -> unformattable_char {
-    return {};
-  }
-
-  FMT_CONSTEXPR FMT_INLINE auto map(float val) -> float { return val; }
-  FMT_CONSTEXPR FMT_INLINE auto map(double val) -> double { return val; }
-  FMT_CONSTEXPR FMT_INLINE auto map(long double val) -> long double {
-    return val;
-  }
-
-  FMT_CONSTEXPR FMT_INLINE auto map(char_type* val) -> const char_type* {
-    return val;
-  }
-  FMT_CONSTEXPR FMT_INLINE auto map(const char_type* val) -> const char_type* {
-    return val;
-  }
-  template <typename T,
-            FMT_ENABLE_IF(is_string<T>::value && !std::is_pointer<T>::value &&
-                          std::is_same<char_type, char_t<T>>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto map(const T& val)
-      -> basic_string_view<char_type> {
-    return to_string_view(val);
-  }
-  template <typename T,
-            FMT_ENABLE_IF(is_string<T>::value && !std::is_pointer<T>::value &&
-                          !std::is_same<char_type, char_t<T>>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto map(const T&) -> unformattable_char {
-    return {};
-  }
-  template <typename T,
-            FMT_ENABLE_IF(
-                std::is_convertible<T, basic_string_view<char_type>>::value &&
-                !is_string<T>::value && !has_formatter<T, Context>::value &&
-                !has_fallback_formatter<T, char_type>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto map(const T& val)
-      -> basic_string_view<char_type> {
-    return basic_string_view<char_type>(val);
-  }
-  template <typename T,
-            FMT_ENABLE_IF(
-                std::is_convertible<T, std_string_view<char_type>>::value &&
-                !std::is_convertible<T, basic_string_view<char_type>>::value &&
-                !is_string<T>::value && !has_formatter<T, Context>::value &&
-                !has_fallback_formatter<T, char_type>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto map(const T& val)
-      -> basic_string_view<char_type> {
-    return std_string_view<char_type>(val);
-  }
-
-  FMT_CONSTEXPR FMT_INLINE auto map(void* val) -> const void* { return val; }
-  FMT_CONSTEXPR FMT_INLINE auto map(const void* val) -> const void* {
-    return val;
-  }
-  FMT_CONSTEXPR FMT_INLINE auto map(std::nullptr_t val) -> const void* {
-    return val;
-  }
-
-  // We use SFINAE instead of a const T* parameter to avoid conflicting with
-  // the C array overload.
-  template <
-      typename T,
-      FMT_ENABLE_IF(
-          std::is_pointer<T>::value || std::is_member_pointer<T>::value ||
-          std::is_function<typename std::remove_pointer<T>::type>::value ||
-          (std::is_convertible<const T&, const void*>::value &&
-           !std::is_convertible<const T&, const char_type*>::value &&
-           !has_formatter<T, Context>::value))>
-  FMT_CONSTEXPR auto map(const T&) -> unformattable_pointer {
-    return {};
-  }
-
-  template <typename T, std::size_t N,
-            FMT_ENABLE_IF(!std::is_same<T, wchar_t>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto map(const T (&values)[N]) -> const T (&)[N] {
-    return values;
-  }
-
-  template <typename T,
-            FMT_ENABLE_IF(
-                std::is_enum<T>::value&& std::is_convertible<T, int>::value &&
-                !has_format_as<T>::value && !has_formatter<T, Context>::value &&
-                !has_fallback_formatter<T, char_type>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto map(const T& val)
-      -> decltype(std::declval<arg_mapper>().map(
-          static_cast<underlying_t<T>>(val))) {
-    return map(static_cast<underlying_t<T>>(val));
-  }
-
-  template <typename T, FMT_ENABLE_IF(has_format_as<T>::value &&
-                                      !has_formatter<T, Context>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto map(const T& val)
-      -> decltype(std::declval<arg_mapper>().map(format_as(T()))) {
-    return map(format_as(val));
-  }
-
-  template <typename T, typename U = remove_cvref_t<T>>
-  struct formattable
-      : bool_constant<has_const_formatter<U, Context>() ||
-                      !std::is_const<remove_reference_t<T>>::value ||
-                      has_fallback_formatter<U, char_type>::value> {};
-
-#if (FMT_MSC_VERSION != 0 && FMT_MSC_VERSION < 1910) || \
-    FMT_ICC_VERSION != 0 || defined(__NVCC__)
-  // Workaround a bug in MSVC and Intel (Issue 2746).
-  template <typename T> FMT_CONSTEXPR FMT_INLINE auto do_map(T&& val) -> T& {
-    return val;
-  }
-#else
-  template <typename T, FMT_ENABLE_IF(formattable<T>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto do_map(T&& val) -> T& {
-    return val;
-  }
-  template <typename T, FMT_ENABLE_IF(!formattable<T>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto do_map(T&&) -> unformattable_const {
-    return {};
-  }
-#endif
-
-  template <typename T, typename U = remove_cvref_t<T>,
-            FMT_ENABLE_IF(!is_string<U>::value && !is_char<U>::value &&
-                          !std::is_array<U>::value &&
-                          !std::is_pointer<U>::value &&
-                          !has_format_as<U>::value &&
-                          (has_formatter<U, Context>::value ||
-                           has_fallback_formatter<U, char_type>::value))>
-  FMT_CONSTEXPR FMT_INLINE auto map(T&& val)
-      -> decltype(this->do_map(std::forward<T>(val))) {
-    return do_map(std::forward<T>(val));
-  }
-
-  template <typename T, FMT_ENABLE_IF(is_named_arg<T>::value)>
-  FMT_CONSTEXPR FMT_INLINE auto map(const T& named_arg)
-      -> decltype(std::declval<arg_mapper>().map(named_arg.value)) {
-    return map(named_arg.value);
-  }
-
-  auto map(...) -> unformattable { return {}; }
-};
-
-// A type constant after applying arg_mapper<Context>.
-template <typename T, typename Context>
-using mapped_type_constant =
-    type_constant<decltype(arg_mapper<Context>().map(std::declval<const T&>())),
-                  typename Context::char_type>;
-
-enum { packed_arg_bits = 4 };
-// Maximum number of arguments with packed types.
-enum { max_packed_args = 62 / packed_arg_bits };
-enum : unsigned long long { is_unpacked_bit = 1ULL << 63 };
-enum : unsigned long long { has_named_args_bit = 1ULL << 62 };
-
-FMT_END_DETAIL_NAMESPACE
-
-// An output iterator that appends to a buffer.
-// It is used to reduce symbol sizes for the common case.
-class appender : public std::back_insert_iterator<detail::buffer<char>> {
-  using base = std::back_insert_iterator<detail::buffer<char>>;
-
- public:
-  using std::back_insert_iterator<detail::buffer<char>>::back_insert_iterator;
-  appender(base it) noexcept : base(it) {}
-  FMT_UNCHECKED_ITERATOR(appender);
-
-  auto operator++() noexcept -> appender& { return *this; }
-  auto operator++(int) noexcept -> appender { return *this; }
-};
-
-// A formatting argument. It is a trivially copyable/constructible type to
-// allow storage in basic_memory_buffer.
-template <typename Context> class basic_format_arg {
- private:
-  detail::value<Context> value_;
-  detail::type type_;
-
-  template <typename ContextType, typename T>
-  friend FMT_CONSTEXPR auto detail::make_arg(T&& value)
-      -> basic_format_arg<ContextType>;
-
-  template <typename Visitor, typename Ctx>
-  friend FMT_CONSTEXPR auto visit_format_arg(Visitor&& vis,
-                                             const basic_format_arg<Ctx>& arg)
-      -> decltype(vis(0));
-
-  friend class basic_format_args<Context>;
-  friend class dynamic_format_arg_store<Context>;
-
-  using char_type = typename Context::char_type;
-
-  template <typename T, typename Char, size_t NUM_ARGS, size_t NUM_NAMED_ARGS>
-  friend struct detail::arg_data;
-
-  basic_format_arg(const detail::named_arg_info<char_type>* args, size_t size)
-      : value_(args, size) {}
-
- public:
-  class handle {
-   public:
-    explicit handle(detail::custom_value<Context> custom) : custom_(custom) {}
-
-    void format(typename Context::parse_context_type& parse_ctx,
-                Context& ctx) const {
-      custom_.format(custom_.value, parse_ctx, ctx);
-    }
-
-   private:
-    detail::custom_value<Context> custom_;
-  };
-
-  constexpr basic_format_arg() : type_(detail::type::none_type) {}
-
-  constexpr explicit operator bool() const noexcept {
-    return type_ != detail::type::none_type;
-  }
-
-  auto type() const -> detail::type { return type_; }
-
-  auto is_integral() const -> bool { return detail::is_integral_type(type_); }
-  auto is_arithmetic() const -> bool {
-    return detail::is_arithmetic_type(type_);
-  }
-};
-
-/**
-  \rst
-  Visits an argument dispatching to the appropriate visit method based on
-  the argument type. For example, if the argument type is ``double`` then
-  ``vis(value)`` will be called with the value of type ``double``.
-  \endrst
- */
-#if FMT_ICC_VERSION != 0
-#pragma warning(disable : 1595)
-#endif
-template <typename Visitor, typename Context>
-FMT_CONSTEXPR FMT_INLINE auto visit_format_arg(
-    Visitor&& vis, const basic_format_arg<Context>& arg) -> decltype(vis(0)) {
-  switch (arg.type_) {
-  case detail::type::none_type:
-    break;
-  case detail::type::int_type:
-    return vis(arg.value_.int_value);
-  case detail::type::uint_type:
-    return vis(arg.value_.uint_value);
-  case detail::type::long_long_type:
-    return vis(arg.value_.long_long_value);
-  case detail::type::ulong_long_type:
-    return vis(arg.value_.ulong_long_value);
-  case detail::type::int128_type:
-    return vis(detail::convert_for_visit(arg.value_.int128_value));
-  case detail::type::uint128_type:
-    return vis(detail::convert_for_visit(arg.value_.uint128_value));
-  case detail::type::bool_type:
-    return vis(arg.value_.bool_value);
-  case detail::type::char_type:
-    return vis(arg.value_.char_value);
-  case detail::type::float_type:
-    return vis(arg.value_.float_value);
-  case detail::type::double_type:
-    return vis(arg.value_.double_value);
-  case detail::type::long_double_type:
-    return vis(arg.value_.long_double_value);
-  case detail::type::cstring_type:
-    return vis(arg.value_.string.data);
-  case detail::type::string_type:
-    using sv = basic_string_view<typename Context::char_type>;
-    return vis(sv(arg.value_.string.data, arg.value_.string.size));
-  case detail::type::pointer_type:
-    return vis(arg.value_.pointer);
-  case detail::type::custom_type:
-    return vis(typename basic_format_arg<Context>::handle(arg.value_.custom));
-  }
-  return vis(monostate());
-}
-
-FMT_BEGIN_DETAIL_NAMESPACE
-
-template <typename Char, typename InputIt>
-auto copy_str(InputIt begin, InputIt end, appender out) -> appender {
-  get_container(out).append(begin, end);
-  return out;
-}
-
-template <typename Char, typename R, typename OutputIt>
-FMT_CONSTEXPR auto copy_str(R&& rng, OutputIt out) -> OutputIt {
-  return detail::copy_str<Char>(rng.begin(), rng.end(), out);
-}
-
-#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500
-// A workaround for gcc 4.8 to make void_t work in a SFINAE context.
-template <typename... Ts> struct void_t_impl { using type = void; };
-template <typename... Ts>
-using void_t = typename detail::void_t_impl<Ts...>::type;
-#else
-template <typename...> using void_t = void;
-#endif
-
-template <typename It, typename T, typename Enable = void>
-struct is_output_iterator : std::false_type {};
-
-template <typename It, typename T>
-struct is_output_iterator<
-    It, T,
-    void_t<typename std::iterator_traits<It>::iterator_category,
-           decltype(*std::declval<It>() = std::declval<T>())>>
-    : std::true_type {};
-
-template <typename OutputIt>
-struct is_back_insert_iterator : std::false_type {};
-template <typename Container>
-struct is_back_insert_iterator<std::back_insert_iterator<Container>>
-    : std::true_type {};
-
-template <typename OutputIt>
-struct is_contiguous_back_insert_iterator : std::false_type {};
-template <typename Container>
-struct is_contiguous_back_insert_iterator<std::back_insert_iterator<Container>>
-    : is_contiguous<Container> {};
-template <>
-struct is_contiguous_back_insert_iterator<appender> : std::true_type {};
-
-// A type-erased reference to an std::locale to avoid a heavy <locale> include.
-class locale_ref {
- private:
-  const void* locale_;  // A type-erased pointer to std::locale.
-
- public:
-  constexpr FMT_INLINE locale_ref() : locale_(nullptr) {}
-  template <typename Locale> explicit locale_ref(const Locale& loc);
-
-  explicit operator bool() const noexcept { return locale_ != nullptr; }
-
-  template <typename Locale> auto get() const -> Locale;
-};
-
-template <typename> constexpr auto encode_types() -> unsigned long long {
-  return 0;
-}
-
-template <typename Context, typename Arg, typename... Args>
-constexpr auto encode_types() -> unsigned long long {
-  return static_cast<unsigned>(mapped_type_constant<Arg, Context>::value) |
-         (encode_types<Context, Args...>() << packed_arg_bits);
-}
-
-template <typename Context, typename T>
-FMT_CONSTEXPR FMT_INLINE auto make_value(T&& val) -> value<Context> {
-  const auto& arg = arg_mapper<Context>().map(FMT_FORWARD(val));
-
-  constexpr bool formattable_char =
-      !std::is_same<decltype(arg), const unformattable_char&>::value;
-  static_assert(formattable_char, "Mixing character types is disallowed.");
-
-  constexpr bool formattable_const =
-      !std::is_same<decltype(arg), const unformattable_const&>::value;
-  static_assert(formattable_const, "Cannot format a const argument.");
-
-  // Formatting of arbitrary pointers is disallowed. If you want to output
-  // a pointer cast it to "void *" or "const void *". In particular, this
-  // forbids formatting of "[const] volatile char *" which is printed as bool
-  // by iostreams.
-  constexpr bool formattable_pointer =
-      !std::is_same<decltype(arg), const unformattable_pointer&>::value;
-  static_assert(formattable_pointer,
-                "Formatting of non-void pointers is disallowed.");
-
-  constexpr bool formattable =
-      !std::is_same<decltype(arg), const unformattable&>::value;
-  static_assert(
-      formattable,
-      "Cannot format an argument. To make type T formattable provide a "
-      "formatter<T> specialization: https://fmt.dev/latest/api.html#udt");
-  return {arg};
-}
-
-template <typename Context, typename T>
-FMT_CONSTEXPR auto make_arg(T&& value) -> basic_format_arg<Context> {
-  basic_format_arg<Context> arg;
-  arg.type_ = mapped_type_constant<T, Context>::value;
-  arg.value_ = make_value<Context>(value);
-  return arg;
-}
-
-// The type template parameter is there to avoid an ODR violation when using
-// a fallback formatter in one translation unit and an implicit conversion in
-// another (not recommended).
-template <bool IS_PACKED, typename Context, type, typename T,
-          FMT_ENABLE_IF(IS_PACKED)>
-FMT_CONSTEXPR FMT_INLINE auto make_arg(T&& val) -> value<Context> {
-  return make_value<Context>(val);
-}
-
-template <bool IS_PACKED, typename Context, type, typename T,
-          FMT_ENABLE_IF(!IS_PACKED)>
-FMT_CONSTEXPR inline auto make_arg(T&& value) -> basic_format_arg<Context> {
-  return make_arg<Context>(value);
-}
-FMT_END_DETAIL_NAMESPACE
-
-// Formatting context.
-template <typename OutputIt, typename Char> class basic_format_context {
- public:
-  /** The character type for the output. */
-  using char_type = Char;
-
- private:
-  OutputIt out_;
-  basic_format_args<basic_format_context> args_;
-  detail::locale_ref loc_;
-
- public:
-  using iterator = OutputIt;
-  using format_arg = basic_format_arg<basic_format_context>;
-  using parse_context_type = basic_format_parse_context<Char>;
-  template <typename T> using formatter_type = formatter<T, char_type>;
-
-  basic_format_context(basic_format_context&&) = default;
-  basic_format_context(const basic_format_context&) = delete;
-  void operator=(const basic_format_context&) = delete;
-  /**
-   Constructs a ``basic_format_context`` object. References to the arguments are
-   stored in the object so make sure they have appropriate lifetimes.
-   */
-  constexpr basic_format_context(
-      OutputIt out, basic_format_args<basic_format_context> ctx_args,
-      detail::locale_ref loc = detail::locale_ref())
-      : out_(out), args_(ctx_args), loc_(loc) {}
-
-  constexpr auto arg(int id) const -> format_arg { return args_.get(id); }
-  FMT_CONSTEXPR auto arg(basic_string_view<char_type> name) -> format_arg {
-    return args_.get(name);
-  }
-  FMT_CONSTEXPR auto arg_id(basic_string_view<char_type> name) -> int {
-    return args_.get_id(name);
-  }
-  auto args() const -> const basic_format_args<basic_format_context>& {
-    return args_;
-  }
-
-  FMT_CONSTEXPR auto error_handler() -> detail::error_handler { return {}; }
-  void on_error(const char* message) { error_handler().on_error(message); }
-
-  // Returns an iterator to the beginning of the output range.
-  FMT_CONSTEXPR auto out() -> iterator { return out_; }
-
-  // Advances the begin iterator to ``it``.
-  void advance_to(iterator it) {
-    if (!detail::is_back_insert_iterator<iterator>()) out_ = it;
-  }
-
-  FMT_CONSTEXPR auto locale() -> detail::locale_ref { return loc_; }
-};
-
-template <typename Char>
-using buffer_context =
-    basic_format_context<detail::buffer_appender<Char>, Char>;
-using format_context = buffer_context<char>;
-
-// Workaround an alias issue: https://stackoverflow.com/q/62767544/471164.
-#define FMT_BUFFER_CONTEXT(Char) \
-  basic_format_context<detail::buffer_appender<Char>, Char>
-
-template <typename T, typename Char = char>
-using is_formattable = bool_constant<
-    !std::is_base_of<detail::unformattable,
-                     decltype(detail::arg_mapper<buffer_context<Char>>().map(
-                         std::declval<T>()))>::value &&
-    !detail::has_fallback_formatter<T, Char>::value>;
-
-/**
-  \rst
-  An array of references to arguments. It can be implicitly converted into
-  `~fmt::basic_format_args` for passing into type-erased formatting functions
-  such as `~fmt::vformat`.
-  \endrst
- */
-template <typename Context, typename... Args>
-class format_arg_store
-#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
-    // Workaround a GCC template argument substitution bug.
-    : public basic_format_args<Context>
-#endif
-{
- private:
-  static const size_t num_args = sizeof...(Args);
-  static const size_t num_named_args = detail::count_named_args<Args...>();
-  static const bool is_packed = num_args <= detail::max_packed_args;
-
-  using value_type = conditional_t<is_packed, detail::value<Context>,
-                                   basic_format_arg<Context>>;
-
-  detail::arg_data<value_type, typename Context::char_type, num_args,
-                   num_named_args>
-      data_;
-
-  friend class basic_format_args<Context>;
-
-  static constexpr unsigned long long desc =
-      (is_packed ? detail::encode_types<Context, Args...>()
-                 : detail::is_unpacked_bit | num_args) |
-      (num_named_args != 0
-           ? static_cast<unsigned long long>(detail::has_named_args_bit)
-           : 0);
-
- public:
-  template <typename... T>
-  FMT_CONSTEXPR FMT_INLINE format_arg_store(T&&... args)
-      :
-#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
-        basic_format_args<Context>(*this),
-#endif
-        data_{detail::make_arg<
-            is_packed, Context,
-            detail::mapped_type_constant<remove_cvref_t<T>, Context>::value>(
-            FMT_FORWARD(args))...} {
-    detail::init_named_args(data_.named_args(), 0, 0, args...);
-  }
-};
-
-/**
-  \rst
-  Constructs a `~fmt::format_arg_store` object that contains references to
-  arguments and can be implicitly converted to `~fmt::format_args`. `Context`
-  can be omitted in which case it defaults to `~fmt::context`.
-  See `~fmt::arg` for lifetime considerations.
-  \endrst
- */
-template <typename Context = format_context, typename... Args>
-constexpr auto make_format_args(Args&&... args)
-    -> format_arg_store<Context, remove_cvref_t<Args>...> {
-  return {FMT_FORWARD(args)...};
-}
-
-/**
-  \rst
-  Returns a named argument to be used in a formatting function.
-  It should only be used in a call to a formatting function or
-  `dynamic_format_arg_store::push_back`.
-
-  **Example**::
-
-    fmt::print("Elapsed time: {s:.2f} seconds", fmt::arg("s", 1.23));
-  \endrst
- */
-template <typename Char, typename T>
-inline auto arg(const Char* name, const T& arg) -> detail::named_arg<Char, T> {
-  static_assert(!detail::is_named_arg<T>(), "nested named arguments");
-  return {name, arg};
-}
-
-/**
-  \rst
-  A view of a collection of formatting arguments. To avoid lifetime issues it
-  should only be used as a parameter type in type-erased functions such as
-  ``vformat``::
-
-    void vlog(string_view format_str, format_args args);  // OK
-    format_args args = make_format_args(42);  // Error: dangling reference
-  \endrst
- */
-template <typename Context> class basic_format_args {
- public:
-  using size_type = int;
-  using format_arg = basic_format_arg<Context>;
-
- private:
-  // A descriptor that contains information about formatting arguments.
-  // If the number of arguments is less or equal to max_packed_args then
-  // argument types are passed in the descriptor. This reduces binary code size
-  // per formatting function call.
-  unsigned long long desc_;
-  union {
-    // If is_packed() returns true then argument values are stored in values_;
-    // otherwise they are stored in args_. This is done to improve cache
-    // locality and reduce compiled code size since storing larger objects
-    // may require more code (at least on x86-64) even if the same amount of
-    // data is actually copied to stack. It saves ~10% on the bloat test.
-    const detail::value<Context>* values_;
-    const format_arg* args_;
-  };
-
-  constexpr auto is_packed() const -> bool {
-    return (desc_ & detail::is_unpacked_bit) == 0;
-  }
-  auto has_named_args() const -> bool {
-    return (desc_ & detail::has_named_args_bit) != 0;
-  }
-
-  FMT_CONSTEXPR auto type(int index) const -> detail::type {
-    int shift = index * detail::packed_arg_bits;
-    unsigned int mask = (1 << detail::packed_arg_bits) - 1;
-    return static_cast<detail::type>((desc_ >> shift) & mask);
-  }
-
-  constexpr FMT_INLINE basic_format_args(unsigned long long desc,
-                                         const detail::value<Context>* values)
-      : desc_(desc), values_(values) {}
-  constexpr basic_format_args(unsigned long long desc, const format_arg* args)
-      : desc_(desc), args_(args) {}
-
- public:
-  constexpr basic_format_args() : desc_(0), args_(nullptr) {}
-
-  /**
-   \rst
-   Constructs a `basic_format_args` object from `~fmt::format_arg_store`.
-   \endrst
-   */
-  template <typename... Args>
-  constexpr FMT_INLINE basic_format_args(
-      const format_arg_store<Context, Args...>& store)
-      : basic_format_args(format_arg_store<Context, Args...>::desc,
-                          store.data_.args()) {}
-
-  /**
-   \rst
-   Constructs a `basic_format_args` object from
-   `~fmt::dynamic_format_arg_store`.
-   \endrst
-   */
-  constexpr FMT_INLINE basic_format_args(
-      const dynamic_format_arg_store<Context>& store)
-      : basic_format_args(store.get_types(), store.data()) {}
-
-  /**
-   \rst
-   Constructs a `basic_format_args` object from a dynamic set of arguments.
-   \endrst
-   */
-  constexpr basic_format_args(const format_arg* args, int count)
-      : basic_format_args(detail::is_unpacked_bit | detail::to_unsigned(count),
-                          args) {}
-
-  /** Returns the argument with the specified id. */
-  FMT_CONSTEXPR auto get(int id) const -> format_arg {
-    format_arg arg;
-    if (!is_packed()) {
-      if (id < max_size()) arg = args_[id];
-      return arg;
-    }
-    if (id >= detail::max_packed_args) return arg;
-    arg.type_ = type(id);
-    if (arg.type_ == detail::type::none_type) return arg;
-    arg.value_ = values_[id];
-    return arg;
-  }
-
-  template <typename Char>
-  auto get(basic_string_view<Char> name) const -> format_arg {
-    int id = get_id(name);
-    return id >= 0 ? get(id) : format_arg();
-  }
-
-  template <typename Char>
-  auto get_id(basic_string_view<Char> name) const -> int {
-    if (!has_named_args()) return -1;
-    const auto& named_args =
-        (is_packed() ? values_[-1] : args_[-1].value_).named_args;
-    for (size_t i = 0; i < named_args.size; ++i) {
-      if (named_args.data[i].name == name) return named_args.data[i].id;
-    }
-    return -1;
-  }
-
-  auto max_size() const -> int {
-    unsigned long long max_packed = detail::max_packed_args;
-    return static_cast<int>(is_packed() ? max_packed
-                                        : desc_ & ~detail::is_unpacked_bit);
-  }
-};
-
-/** An alias to ``basic_format_args<format_context>``. */
-// A separate type would result in shorter symbols but break ABI compatibility
-// between clang and gcc on ARM (#1919).
-using format_args = basic_format_args<format_context>;
-
-// We cannot use enum classes as bit fields because of a gcc bug, so we put them
-// in namespaces instead (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414).
-// Additionally, if an underlying type is specified, older gcc incorrectly warns
-// that the type is too small. Both bugs are fixed in gcc 9.3.
-#if FMT_GCC_VERSION && FMT_GCC_VERSION < 903
-#  define FMT_ENUM_UNDERLYING_TYPE(type)
-#else
-#  define FMT_ENUM_UNDERLYING_TYPE(type) : type
-#endif
-namespace align {
-enum type FMT_ENUM_UNDERLYING_TYPE(unsigned char){none, left, right, center,
-                                                  numeric};
-}
-using align_t = align::type;
-namespace sign {
-enum type FMT_ENUM_UNDERLYING_TYPE(unsigned char){none, minus, plus, space};
-}
-using sign_t = sign::type;
-
-FMT_BEGIN_DETAIL_NAMESPACE
-
-// Workaround an array initialization issue in gcc 4.8.
-template <typename Char> struct fill_t {
- private:
-  enum { max_size = 4 };
-  Char data_[max_size] = {Char(' '), Char(0), Char(0), Char(0)};
-  unsigned char size_ = 1;
-
- public:
-  FMT_CONSTEXPR void operator=(basic_string_view<Char> s) {
-    auto size = s.size();
-    if (size > max_size) return throw_format_error("invalid fill");
-    for (size_t i = 0; i < size; ++i) data_[i] = s[i];
-    size_ = static_cast<unsigned char>(size);
-  }
-
-  constexpr auto size() const -> size_t { return size_; }
-  constexpr auto data() const -> const Char* { return data_; }
-
-  FMT_CONSTEXPR auto operator[](size_t index) -> Char& { return data_[index]; }
-  FMT_CONSTEXPR auto operator[](size_t index) const -> const Char& {
-    return data_[index];
-  }
-};
-FMT_END_DETAIL_NAMESPACE
-
-enum class presentation_type : unsigned char {
-  none,
-  // Integer types should go first,
-  dec,             // 'd'
-  oct,             // 'o'
-  hex_lower,       // 'x'
-  hex_upper,       // 'X'
-  bin_lower,       // 'b'
-  bin_upper,       // 'B'
-  hexfloat_lower,  // 'a'
-  hexfloat_upper,  // 'A'
-  exp_lower,       // 'e'
-  exp_upper,       // 'E'
-  fixed_lower,     // 'f'
-  fixed_upper,     // 'F'
-  general_lower,   // 'g'
-  general_upper,   // 'G'
-  chr,             // 'c'
-  string,          // 's'
-  pointer,         // 'p'
-  debug            // '?'
-};
-
-// Format specifiers for built-in and string types.
-template <typename Char> struct basic_format_specs {
-  int width;
-  int precision;
-  presentation_type type;
-  align_t align : 4;
-  sign_t sign : 3;
-  bool alt : 1;  // Alternate form ('#').
-  bool localized : 1;
-  detail::fill_t<Char> fill;
-
-  constexpr basic_format_specs()
-      : width(0),
-        precision(-1),
-        type(presentation_type::none),
-        align(align::none),
-        sign(sign::none),
-        alt(false),
-        localized(false) {}
-};
-
-using format_specs = basic_format_specs<char>;
-
-FMT_BEGIN_DETAIL_NAMESPACE
-
-enum class arg_id_kind { none, index, name };
-
-// An argument reference.
-template <typename Char> struct arg_ref {
-  FMT_CONSTEXPR arg_ref() : kind(arg_id_kind::none), val() {}
-
-  FMT_CONSTEXPR explicit arg_ref(int index)
-      : kind(arg_id_kind::index), val(index) {}
-  FMT_CONSTEXPR explicit arg_ref(basic_string_view<Char> name)
-      : kind(arg_id_kind::name), val(name) {}
-
-  FMT_CONSTEXPR auto operator=(int idx) -> arg_ref& {
-    kind = arg_id_kind::index;
-    val.index = idx;
-    return *this;
-  }
-
-  arg_id_kind kind;
-  union value {
-    FMT_CONSTEXPR value(int id = 0) : index{id} {}
-    FMT_CONSTEXPR value(basic_string_view<Char> n) : name(n) {}
-
-    int index;
-    basic_string_view<Char> name;
-  } val;
-};
-
-// Format specifiers with width and precision resolved at formatting rather
-// than parsing time to allow re-using the same parsed specifiers with
-// different sets of arguments (precompilation of format strings).
-template <typename Char>
-struct dynamic_format_specs : basic_format_specs<Char> {
-  arg_ref<Char> width_ref;
-  arg_ref<Char> precision_ref;
-};
-
-struct auto_id {};
-
-// A format specifier handler that sets fields in basic_format_specs.
-template <typename Char> class specs_setter {
- protected:
-  basic_format_specs<Char>& specs_;
-
- public:
-  explicit FMT_CONSTEXPR specs_setter(basic_format_specs<Char>& specs)
-      : specs_(specs) {}
-
-  FMT_CONSTEXPR specs_setter(const specs_setter& other)
-      : specs_(other.specs_) {}
-
-  FMT_CONSTEXPR void on_align(align_t align) { specs_.align = align; }
-  FMT_CONSTEXPR void on_fill(basic_string_view<Char> fill) {
-    specs_.fill = fill;
-  }
-  FMT_CONSTEXPR void on_sign(sign_t s) { specs_.sign = s; }
-  FMT_CONSTEXPR void on_hash() { specs_.alt = true; }
-  FMT_CONSTEXPR void on_localized() { specs_.localized = true; }
-
-  FMT_CONSTEXPR void on_zero() {
-    if (specs_.align == align::none) specs_.align = align::numeric;
-    specs_.fill[0] = Char('0');
-  }
-
-  FMT_CONSTEXPR void on_width(int width) { specs_.width = width; }
-  FMT_CONSTEXPR void on_precision(int precision) {
-    specs_.precision = precision;
-  }
-  FMT_CONSTEXPR void end_precision() {}
-
-  FMT_CONSTEXPR void on_type(presentation_type type) { specs_.type = type; }
-};
-
-// Format spec handler that saves references to arguments representing dynamic
-// width and precision to be resolved at formatting time.
-template <typename ParseContext>
-class dynamic_specs_handler
-    : public specs_setter<typename ParseContext::char_type> {
- public:
-  using char_type = typename ParseContext::char_type;
-
-  FMT_CONSTEXPR dynamic_specs_handler(dynamic_format_specs<char_type>& specs,
-                                      ParseContext& ctx)
-      : specs_setter<char_type>(specs), specs_(specs), context_(ctx) {}
-
-  FMT_CONSTEXPR dynamic_specs_handler(const dynamic_specs_handler& other)
-      : specs_setter<char_type>(other),
-        specs_(other.specs_),
-        context_(other.context_) {}
-
-  template <typename Id> FMT_CONSTEXPR void on_dynamic_width(Id arg_id) {
-    specs_.width_ref = make_arg_ref(arg_id);
-  }
-
-  template <typename Id> FMT_CONSTEXPR void on_dynamic_precision(Id arg_id) {
-    specs_.precision_ref = make_arg_ref(arg_id);
-  }
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    context_.on_error(message);
-  }
-
- private:
-  dynamic_format_specs<char_type>& specs_;
-  ParseContext& context_;
-
-  using arg_ref_type = arg_ref<char_type>;
-
-  FMT_CONSTEXPR auto make_arg_ref(int arg_id) -> arg_ref_type {
-    context_.check_arg_id(arg_id);
-    context_.check_dynamic_spec(arg_id);
-    return arg_ref_type(arg_id);
-  }
-
-  FMT_CONSTEXPR auto make_arg_ref(auto_id) -> arg_ref_type {
-    int arg_id = context_.next_arg_id();
-    context_.check_dynamic_spec(arg_id);
-    return arg_ref_type(arg_id);
-  }
-
-  FMT_CONSTEXPR auto make_arg_ref(basic_string_view<char_type> arg_id)
-      -> arg_ref_type {
-    context_.check_arg_id(arg_id);
-    basic_string_view<char_type> format_str(
-        context_.begin(), to_unsigned(context_.end() - context_.begin()));
-    return arg_ref_type(arg_id);
-  }
-};
-
-template <typename Char> constexpr bool is_ascii_letter(Char c) {
-  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
-}
-
-// Converts a character to ASCII. Returns a number > 127 on conversion failure.
-template <typename Char, FMT_ENABLE_IF(std::is_integral<Char>::value)>
-constexpr auto to_ascii(Char c) -> Char {
-  return c;
-}
-template <typename Char, FMT_ENABLE_IF(std::is_enum<Char>::value)>
-constexpr auto to_ascii(Char c) -> underlying_t<Char> {
-  return c;
-}
-
-FMT_CONSTEXPR inline auto code_point_length_impl(char c) -> int {
-  return "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4"
-      [static_cast<unsigned char>(c) >> 3];
-}
-
-template <typename Char>
-FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int {
-  if (const_check(sizeof(Char) != 1)) return 1;
-  int len = code_point_length_impl(static_cast<char>(*begin));
-
-  // Compute the pointer to the next character early so that the next
-  // iteration can start working on the next character. Neither Clang
-  // nor GCC figure out this reordering on their own.
-  return len + !len;
-}
-
-// Return the result via the out param to workaround gcc bug 77539.
-template <bool IS_CONSTEXPR, typename T, typename Ptr = const T*>
-FMT_CONSTEXPR auto find(Ptr first, Ptr last, T value, Ptr& out) -> bool {
-  for (out = first; out != last; ++out) {
-    if (*out == value) return true;
-  }
-  return false;
-}
-
-template <>
-inline auto find<false, char>(const char* first, const char* last, char value,
-                              const char*& out) -> bool {
-  out = static_cast<const char*>(
-      std::memchr(first, value, to_unsigned(last - first)));
-  return out != nullptr;
-}
-
-// Parses the range [begin, end) as an unsigned integer. This function assumes
-// that the range is non-empty and the first character is a digit.
-template <typename Char>
-FMT_CONSTEXPR auto parse_nonnegative_int(const Char*& begin, const Char* end,
-                                         int error_value) noexcept -> int {
-  FMT_ASSERT(begin != end && '0' <= *begin && *begin <= '9', "");
-  unsigned value = 0, prev = 0;
-  auto p = begin;
-  do {
-    prev = value;
-    value = value * 10 + unsigned(*p - '0');
-    ++p;
-  } while (p != end && '0' <= *p && *p <= '9');
-  auto num_digits = p - begin;
-  begin = p;
-  if (num_digits <= std::numeric_limits<int>::digits10)
-    return static_cast<int>(value);
-  // Check for overflow.
-  const unsigned max = to_unsigned((std::numeric_limits<int>::max)());
-  return num_digits == std::numeric_limits<int>::digits10 + 1 &&
-                 prev * 10ull + unsigned(p[-1] - '0') <= max
-             ? static_cast<int>(value)
-             : error_value;
-}
-
-// Parses fill and alignment.
-template <typename Char, typename Handler>
-FMT_CONSTEXPR auto parse_align(const Char* begin, const Char* end,
-                               Handler&& handler) -> const Char* {
-  FMT_ASSERT(begin != end, "");
-  auto align = align::none;
-  auto p = begin + code_point_length(begin);
-  if (end - p <= 0) p = begin;
-  for (;;) {
-    switch (to_ascii(*p)) {
-    case '<':
-      align = align::left;
-      break;
-    case '>':
-      align = align::right;
-      break;
-    case '^':
-      align = align::center;
-      break;
-    default:
-      break;
-    }
-    if (align != align::none) {
-      if (p != begin) {
-        auto c = *begin;
-        if (c == '{')
-          return handler.on_error("invalid fill character '{'"), begin;
-        if (c == '}') return begin;
-        handler.on_fill(basic_string_view<Char>(begin, to_unsigned(p - begin)));
-        begin = p + 1;
-      } else
-        ++begin;
-      handler.on_align(align);
-      break;
-    } else if (p == begin) {
-      break;
-    }
-    p = begin;
-  }
-  return begin;
-}
-
-template <typename Char> FMT_CONSTEXPR bool is_name_start(Char c) {
-  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || '_' == c;
-}
-
-template <typename Char, typename IDHandler>
-FMT_CONSTEXPR auto do_parse_arg_id(const Char* begin, const Char* end,
-                                   IDHandler&& handler) -> const Char* {
-  FMT_ASSERT(begin != end, "");
-  Char c = *begin;
-  if (c >= '0' && c <= '9') {
-    int index = 0;
-    if (c != '0')
-      index =
-          parse_nonnegative_int(begin, end, (std::numeric_limits<int>::max)());
-    else
-      ++begin;
-    if (begin == end || (*begin != '}' && *begin != ':'))
-      handler.on_error("invalid format string");
-    else
-      handler(index);
-    return begin;
-  }
-  if (!is_name_start(c)) {
-    handler.on_error("invalid format string");
-    return begin;
-  }
-  auto it = begin;
-  do {
-    ++it;
-  } while (it != end && (is_name_start(c = *it) || ('0' <= c && c <= '9')));
-  handler(basic_string_view<Char>(begin, to_unsigned(it - begin)));
-  return it;
-}
-
-template <typename Char, typename IDHandler>
-FMT_CONSTEXPR FMT_INLINE auto parse_arg_id(const Char* begin, const Char* end,
-                                           IDHandler&& handler) -> const Char* {
-  Char c = *begin;
-  if (c != '}' && c != ':') return do_parse_arg_id(begin, end, handler);
-  handler();
-  return begin;
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR auto parse_width(const Char* begin, const Char* end,
-                               Handler&& handler) -> const Char* {
-  using detail::auto_id;
-  struct width_adapter {
-    Handler& handler;
-
-    FMT_CONSTEXPR void operator()() { handler.on_dynamic_width(auto_id()); }
-    FMT_CONSTEXPR void operator()(int id) { handler.on_dynamic_width(id); }
-    FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
-      handler.on_dynamic_width(id);
-    }
-    FMT_CONSTEXPR void on_error(const char* message) {
-      if (message) handler.on_error(message);
-    }
-  };
-
-  FMT_ASSERT(begin != end, "");
-  if ('0' <= *begin && *begin <= '9') {
-    int width = parse_nonnegative_int(begin, end, -1);
-    if (width != -1)
-      handler.on_width(width);
-    else
-      handler.on_error("number is too big");
-  } else if (*begin == '{') {
-    ++begin;
-    if (begin != end) begin = parse_arg_id(begin, end, width_adapter{handler});
-    if (begin == end || *begin != '}')
-      return handler.on_error("invalid format string"), begin;
-    ++begin;
-  }
-  return begin;
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR auto parse_precision(const Char* begin, const Char* end,
-                                   Handler&& handler) -> const Char* {
-  using detail::auto_id;
-  struct precision_adapter {
-    Handler& handler;
-
-    FMT_CONSTEXPR void operator()() { handler.on_dynamic_precision(auto_id()); }
-    FMT_CONSTEXPR void operator()(int id) { handler.on_dynamic_precision(id); }
-    FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
-      handler.on_dynamic_precision(id);
-    }
-    FMT_CONSTEXPR void on_error(const char* message) {
-      if (message) handler.on_error(message);
-    }
-  };
-
-  ++begin;
-  auto c = begin != end ? *begin : Char();
-  if ('0' <= c && c <= '9') {
-    auto precision = parse_nonnegative_int(begin, end, -1);
-    if (precision != -1)
-      handler.on_precision(precision);
-    else
-      handler.on_error("number is too big");
-  } else if (c == '{') {
-    ++begin;
-    if (begin != end)
-      begin = parse_arg_id(begin, end, precision_adapter{handler});
-    if (begin == end || *begin++ != '}')
-      return handler.on_error("invalid format string"), begin;
-  } else {
-    return handler.on_error("missing precision specifier"), begin;
-  }
-  handler.end_precision();
-  return begin;
-}
-
-template <typename Char>
-FMT_CONSTEXPR auto parse_presentation_type(Char type) -> presentation_type {
-  switch (to_ascii(type)) {
-  case 'd':
-    return presentation_type::dec;
-  case 'o':
-    return presentation_type::oct;
-  case 'x':
-    return presentation_type::hex_lower;
-  case 'X':
-    return presentation_type::hex_upper;
-  case 'b':
-    return presentation_type::bin_lower;
-  case 'B':
-    return presentation_type::bin_upper;
-  case 'a':
-    return presentation_type::hexfloat_lower;
-  case 'A':
-    return presentation_type::hexfloat_upper;
-  case 'e':
-    return presentation_type::exp_lower;
-  case 'E':
-    return presentation_type::exp_upper;
-  case 'f':
-    return presentation_type::fixed_lower;
-  case 'F':
-    return presentation_type::fixed_upper;
-  case 'g':
-    return presentation_type::general_lower;
-  case 'G':
-    return presentation_type::general_upper;
-  case 'c':
-    return presentation_type::chr;
-  case 's':
-    return presentation_type::string;
-  case 'p':
-    return presentation_type::pointer;
-  case '?':
-    return presentation_type::debug;
-  default:
-    return presentation_type::none;
-  }
-}
-
-// Parses standard format specifiers and sends notifications about parsed
-// components to handler.
-template <typename Char, typename SpecHandler>
-FMT_CONSTEXPR FMT_INLINE auto parse_format_specs(const Char* begin,
-                                                 const Char* end,
-                                                 SpecHandler&& handler)
-    -> const Char* {
-  if (1 < end - begin && begin[1] == '}' && is_ascii_letter(*begin) &&
-      *begin != 'L') {
-    presentation_type type = parse_presentation_type(*begin++);
-    if (type == presentation_type::none)
-      handler.on_error("invalid type specifier");
-    handler.on_type(type);
-    return begin;
-  }
-
-  if (begin == end) return begin;
-
-  begin = parse_align(begin, end, handler);
-  if (begin == end) return begin;
-
-  // Parse sign.
-  switch (to_ascii(*begin)) {
-  case '+':
-    handler.on_sign(sign::plus);
-    ++begin;
-    break;
-  case '-':
-    handler.on_sign(sign::minus);
-    ++begin;
-    break;
-  case ' ':
-    handler.on_sign(sign::space);
-    ++begin;
-    break;
-  default:
-    break;
-  }
-  if (begin == end) return begin;
-
-  if (*begin == '#') {
-    handler.on_hash();
-    if (++begin == end) return begin;
-  }
-
-  // Parse zero flag.
-  if (*begin == '0') {
-    handler.on_zero();
-    if (++begin == end) return begin;
-  }
-
-  begin = parse_width(begin, end, handler);
-  if (begin == end) return begin;
-
-  // Parse precision.
-  if (*begin == '.') {
-    begin = parse_precision(begin, end, handler);
-    if (begin == end) return begin;
-  }
-
-  if (*begin == 'L') {
-    handler.on_localized();
-    ++begin;
-  }
-
-  // Parse type.
-  if (begin != end && *begin != '}') {
-    presentation_type type = parse_presentation_type(*begin++);
-    if (type == presentation_type::none)
-      handler.on_error("invalid type specifier");
-    handler.on_type(type);
-  }
-  return begin;
-}
-
-template <typename Char, typename Handler>
-FMT_CONSTEXPR auto parse_replacement_field(const Char* begin, const Char* end,
-                                           Handler&& handler) -> const Char* {
-  struct id_adapter {
-    Handler& handler;
-    int arg_id;
-
-    FMT_CONSTEXPR void operator()() { arg_id = handler.on_arg_id(); }
-    FMT_CONSTEXPR void operator()(int id) { arg_id = handler.on_arg_id(id); }
-    FMT_CONSTEXPR void operator()(basic_string_view<Char> id) {
-      arg_id = handler.on_arg_id(id);
-    }
-    FMT_CONSTEXPR void on_error(const char* message) {
-      if (message) handler.on_error(message);
-    }
-  };
-
-  ++begin;
-  if (begin == end) return handler.on_error("invalid format string"), end;
-  if (*begin == '}') {
-    handler.on_replacement_field(handler.on_arg_id(), begin);
-  } else if (*begin == '{') {
-    handler.on_text(begin, begin + 1);
-  } else {
-    auto adapter = id_adapter{handler, 0};
-    begin = parse_arg_id(begin, end, adapter);
-    Char c = begin != end ? *begin : Char();
-    if (c == '}') {
-      handler.on_replacement_field(adapter.arg_id, begin);
-    } else if (c == ':') {
-      begin = handler.on_format_specs(adapter.arg_id, begin + 1, end);
-      if (begin == end || *begin != '}')
-        return handler.on_error("unknown format specifier"), end;
-    } else {
-      return handler.on_error("missing '}' in format string"), end;
-    }
-  }
-  return begin + 1;
-}
-
-template <bool IS_CONSTEXPR, typename Char, typename Handler>
-FMT_CONSTEXPR FMT_INLINE void parse_format_string(
-    basic_string_view<Char> format_str, Handler&& handler) {
-  // Workaround a name-lookup bug in MSVC's modules implementation.
-  using detail::find;
-
-  auto begin = format_str.data();
-  auto end = begin + format_str.size();
-  if (end - begin < 32) {
-    // Use a simple loop instead of memchr for small strings.
-    const Char* p = begin;
-    while (p != end) {
-      auto c = *p++;
-      if (c == '{') {
-        handler.on_text(begin, p - 1);
-        begin = p = parse_replacement_field(p - 1, end, handler);
-      } else if (c == '}') {
-        if (p == end || *p != '}')
-          return handler.on_error("unmatched '}' in format string");
-        handler.on_text(begin, p);
-        begin = ++p;
-      }
-    }
-    handler.on_text(begin, end);
-    return;
-  }
-  struct writer {
-    FMT_CONSTEXPR void operator()(const Char* from, const Char* to) {
-      if (from == to) return;
-      for (;;) {
-        const Char* p = nullptr;
-        if (!find<IS_CONSTEXPR>(from, to, Char('}'), p))
-          return handler_.on_text(from, to);
-        ++p;
-        if (p == to || *p != '}')
-          return handler_.on_error("unmatched '}' in format string");
-        handler_.on_text(from, p);
-        from = p + 1;
-      }
-    }
-    Handler& handler_;
-  } write = {handler};
-  while (begin != end) {
-    // Doing two passes with memchr (one for '{' and another for '}') is up to
-    // 2.5x faster than the naive one-pass implementation on big format strings.
-    const Char* p = begin;
-    if (*begin != '{' && !find<IS_CONSTEXPR>(begin + 1, end, Char('{'), p))
-      return write(begin, end);
-    write(begin, p);
-    begin = parse_replacement_field(p, end, handler);
-  }
-}
-
-template <typename T, bool = is_named_arg<T>::value> struct strip_named_arg {
-  using type = T;
-};
-template <typename T> struct strip_named_arg<T, true> {
-  using type = remove_cvref_t<decltype(T::value)>;
-};
-
-template <typename T, typename ParseContext>
-FMT_CONSTEXPR auto parse_format_specs(ParseContext& ctx)
-    -> decltype(ctx.begin()) {
-  using char_type = typename ParseContext::char_type;
-  using context = buffer_context<char_type>;
-  using stripped_type = typename strip_named_arg<T>::type;
-  using mapped_type = conditional_t<
-      mapped_type_constant<T, context>::value != type::custom_type,
-      decltype(arg_mapper<context>().map(std::declval<const T&>())),
-      stripped_type>;
-  auto f = conditional_t<has_formatter<mapped_type, context>::value,
-                         formatter<mapped_type, char_type>,
-                         fallback_formatter<stripped_type, char_type>>();
-  return f.parse(ctx);
-}
-
-template <typename ErrorHandler>
-FMT_CONSTEXPR void check_int_type_spec(presentation_type type,
-                                       ErrorHandler&& eh) {
-  if (type > presentation_type::bin_upper && type != presentation_type::chr)
-    eh.on_error("invalid type specifier");
-}
-
-// Checks char specs and returns true if the type spec is char (and not int).
-template <typename Char, typename ErrorHandler = error_handler>
-FMT_CONSTEXPR auto check_char_specs(const basic_format_specs<Char>& specs,
-                                    ErrorHandler&& eh = {}) -> bool {
-  if (specs.type != presentation_type::none &&
-      specs.type != presentation_type::chr &&
-      specs.type != presentation_type::debug) {
-    check_int_type_spec(specs.type, eh);
-    return false;
-  }
-  if (specs.align == align::numeric || specs.sign != sign::none || specs.alt)
-    eh.on_error("invalid format specifier for char");
-  return true;
-}
-
-// A floating-point presentation format.
-enum class float_format : unsigned char {
-  general,  // General: exponent notation or fixed point based on magnitude.
-  exp,      // Exponent notation with the default precision of 6, e.g. 1.2e-3.
-  fixed,    // Fixed point with the default precision of 6, e.g. 0.0012.
-  hex
-};
-
-struct float_specs {
-  int precision;
-  float_format format : 8;
-  sign_t sign : 8;
-  bool upper : 1;
-  bool locale : 1;
-  bool binary32 : 1;
-  bool showpoint : 1;
-};
-
-template <typename ErrorHandler = error_handler, typename Char>
-FMT_CONSTEXPR auto parse_float_type_spec(const basic_format_specs<Char>& specs,
-                                         ErrorHandler&& eh = {})
-    -> float_specs {
-  auto result = float_specs();
-  result.showpoint = specs.alt;
-  result.locale = specs.localized;
-  switch (specs.type) {
-  case presentation_type::none:
-    result.format = float_format::general;
-    break;
-  case presentation_type::general_upper:
-    result.upper = true;
-    FMT_FALLTHROUGH;
-  case presentation_type::general_lower:
-    result.format = float_format::general;
-    break;
-  case presentation_type::exp_upper:
-    result.upper = true;
-    FMT_FALLTHROUGH;
-  case presentation_type::exp_lower:
-    result.format = float_format::exp;
-    result.showpoint |= specs.precision != 0;
-    break;
-  case presentation_type::fixed_upper:
-    result.upper = true;
-    FMT_FALLTHROUGH;
-  case presentation_type::fixed_lower:
-    result.format = float_format::fixed;
-    result.showpoint |= specs.precision != 0;
-    break;
-  case presentation_type::hexfloat_upper:
-    result.upper = true;
-    FMT_FALLTHROUGH;
-  case presentation_type::hexfloat_lower:
-    result.format = float_format::hex;
-    break;
-  default:
-    eh.on_error("invalid type specifier");
-    break;
-  }
-  return result;
-}
-
-template <typename ErrorHandler = error_handler>
-FMT_CONSTEXPR auto check_cstring_type_spec(presentation_type type,
-                                           ErrorHandler&& eh = {}) -> bool {
-  if (type == presentation_type::none || type == presentation_type::string ||
-      type == presentation_type::debug)
-    return true;
-  if (type != presentation_type::pointer) eh.on_error("invalid type specifier");
-  return false;
-}
-
-template <typename ErrorHandler = error_handler>
-FMT_CONSTEXPR void check_string_type_spec(presentation_type type,
-                                          ErrorHandler&& eh = {}) {
-  if (type != presentation_type::none && type != presentation_type::string &&
-      type != presentation_type::debug)
-    eh.on_error("invalid type specifier");
-}
-
-template <typename ErrorHandler>
-FMT_CONSTEXPR void check_pointer_type_spec(presentation_type type,
-                                           ErrorHandler&& eh) {
-  if (type != presentation_type::none && type != presentation_type::pointer)
-    eh.on_error("invalid type specifier");
-}
-
-// A parse_format_specs handler that checks if specifiers are consistent with
-// the argument type.
-template <typename Handler> class specs_checker : public Handler {
- private:
-  detail::type arg_type_;
-
-  FMT_CONSTEXPR void require_numeric_argument() {
-    if (!is_arithmetic_type(arg_type_))
-      this->on_error("format specifier requires numeric argument");
-  }
-
- public:
-  FMT_CONSTEXPR specs_checker(const Handler& handler, detail::type arg_type)
-      : Handler(handler), arg_type_(arg_type) {}
-
-  FMT_CONSTEXPR void on_align(align_t align) {
-    if (align == align::numeric) require_numeric_argument();
-    Handler::on_align(align);
-  }
-
-  FMT_CONSTEXPR void on_sign(sign_t s) {
-    require_numeric_argument();
-    if (is_integral_type(arg_type_) && arg_type_ != type::int_type &&
-        arg_type_ != type::long_long_type && arg_type_ != type::int128_type &&
-        arg_type_ != type::char_type) {
-      this->on_error("format specifier requires signed argument");
-    }
-    Handler::on_sign(s);
-  }
-
-  FMT_CONSTEXPR void on_hash() {
-    require_numeric_argument();
-    Handler::on_hash();
-  }
-
-  FMT_CONSTEXPR void on_localized() {
-    require_numeric_argument();
-    Handler::on_localized();
-  }
-
-  FMT_CONSTEXPR void on_zero() {
-    require_numeric_argument();
-    Handler::on_zero();
-  }
-
-  FMT_CONSTEXPR void end_precision() {
-    if (is_integral_type(arg_type_) || arg_type_ == type::pointer_type)
-      this->on_error("precision not allowed for this argument type");
-  }
-};
-
-constexpr int invalid_arg_index = -1;
-
-#if FMT_USE_NONTYPE_TEMPLATE_ARGS
-template <int N, typename T, typename... Args, typename Char>
-constexpr auto get_arg_index_by_name(basic_string_view<Char> name) -> int {
-  if constexpr (detail::is_statically_named_arg<T>()) {
-    if (name == T::name) return N;
-  }
-  if constexpr (sizeof...(Args) > 0)
-    return get_arg_index_by_name<N + 1, Args...>(name);
-  (void)name;  // Workaround an MSVC bug about "unused" parameter.
-  return invalid_arg_index;
-}
-#endif
-
-template <typename... Args, typename Char>
-FMT_CONSTEXPR auto get_arg_index_by_name(basic_string_view<Char> name) -> int {
-#if FMT_USE_NONTYPE_TEMPLATE_ARGS
-  if constexpr (sizeof...(Args) > 0)
-    return get_arg_index_by_name<0, Args...>(name);
-#endif
-  (void)name;
-  return invalid_arg_index;
-}
-
-template <typename Char, typename ErrorHandler, typename... Args>
-class format_string_checker {
- private:
-  // In the future basic_format_parse_context will replace compile_parse_context
-  // here and will use is_constant_evaluated and downcasting to access the data
-  // needed for compile-time checks: https://godbolt.org/z/GvWzcTjh1.
-  using parse_context_type = compile_parse_context<Char, ErrorHandler>;
-  static constexpr int num_args = sizeof...(Args);
-
-  // Format specifier parsing function.
-  using parse_func = const Char* (*)(parse_context_type&);
-
-  parse_context_type context_;
-  parse_func parse_funcs_[num_args > 0 ? static_cast<size_t>(num_args) : 1];
-  type types_[num_args > 0 ? static_cast<size_t>(num_args) : 1];
-
- public:
-  explicit FMT_CONSTEXPR format_string_checker(
-      basic_string_view<Char> format_str, ErrorHandler eh)
-      : context_(format_str, num_args, types_, eh),
-        parse_funcs_{&parse_format_specs<Args, parse_context_type>...},
-        types_{
-            mapped_type_constant<Args,
-                                 basic_format_context<Char*, Char>>::value...} {
-  }
-
-  FMT_CONSTEXPR void on_text(const Char*, const Char*) {}
-
-  FMT_CONSTEXPR auto on_arg_id() -> int { return context_.next_arg_id(); }
-  FMT_CONSTEXPR auto on_arg_id(int id) -> int {
-    return context_.check_arg_id(id), id;
-  }
-  FMT_CONSTEXPR auto on_arg_id(basic_string_view<Char> id) -> int {
-#if FMT_USE_NONTYPE_TEMPLATE_ARGS
-    auto index = get_arg_index_by_name<Args...>(id);
-    if (index == invalid_arg_index) on_error("named argument is not found");
-    return context_.check_arg_id(index), index;
-#else
-    (void)id;
-    on_error("compile-time checks for named arguments require C++20 support");
-    return 0;
-#endif
-  }
-
-  FMT_CONSTEXPR void on_replacement_field(int, const Char*) {}
-
-  FMT_CONSTEXPR auto on_format_specs(int id, const Char* begin, const Char*)
-      -> const Char* {
-    context_.advance_to(context_.begin() + (begin - &*context_.begin()));
-    // id >= 0 check is a workaround for gcc 10 bug (#2065).
-    return id >= 0 && id < num_args ? parse_funcs_[id](context_) : begin;
-  }
-
-  FMT_CONSTEXPR void on_error(const char* message) {
-    context_.on_error(message);
-  }
-};
-
-// Reports a compile-time error if S is not a valid format string.
-template <typename..., typename S, FMT_ENABLE_IF(!is_compile_string<S>::value)>
-FMT_INLINE void check_format_string(const S&) {
-#ifdef FMT_ENFORCE_COMPILE_STRING
-  static_assert(is_compile_string<S>::value,
-                "FMT_ENFORCE_COMPILE_STRING requires all format strings to use "
-                "FMT_STRING.");
-#endif
-}
-template <typename... Args, typename S,
-          FMT_ENABLE_IF(is_compile_string<S>::value)>
-void check_format_string(S format_str) {
-  FMT_CONSTEXPR auto s = basic_string_view<typename S::char_type>(format_str);
-  using checker = format_string_checker<typename S::char_type, error_handler,
-                                        remove_cvref_t<Args>...>;
-  FMT_CONSTEXPR bool invalid_format =
-      (parse_format_string<true>(s, checker(s, {})), true);
-  ignore_unused(invalid_format);
-}
-
-// Don't use type_identity for args to simplify symbols.
-template <typename Char>
-void vformat_to(buffer<Char>& buf, basic_string_view<Char> fmt,
-                basic_format_args<FMT_BUFFER_CONTEXT(Char)> args,
-                locale_ref loc = {});
-
-FMT_API void vprint_mojibake(std::FILE*, string_view, format_args);
-#ifndef _WIN32
-inline void vprint_mojibake(std::FILE*, string_view, format_args) {}
-#endif
-FMT_END_DETAIL_NAMESPACE
-
-// A formatter specialization for the core types corresponding to detail::type
-// constants.
-template <typename T, typename Char>
-struct formatter<T, Char,
-                 enable_if_t<detail::type_constant<T, Char>::value !=
-                             detail::type::custom_type>> {
- private:
-  detail::dynamic_format_specs<Char> specs_;
-
- public:
-  // Parses format specifiers stopping either at the end of the range or at the
-  // terminating '}'.
-  template <typename ParseContext>
-  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    auto begin = ctx.begin(), end = ctx.end();
-    if (begin == end) return begin;
-    using handler_type = detail::dynamic_specs_handler<ParseContext>;
-    auto type = detail::type_constant<T, Char>::value;
-    auto checker =
-        detail::specs_checker<handler_type>(handler_type(specs_, ctx), type);
-    auto it = detail::parse_format_specs(begin, end, checker);
-    auto eh = ctx.error_handler();
-    switch (type) {
-    case detail::type::none_type:
-      FMT_ASSERT(false, "invalid argument type");
-      break;
-    case detail::type::bool_type:
-      if (specs_.type == presentation_type::none ||
-          specs_.type == presentation_type::string) {
-        break;
-      }
-      FMT_FALLTHROUGH;
-    case detail::type::int_type:
-    case detail::type::uint_type:
-    case detail::type::long_long_type:
-    case detail::type::ulong_long_type:
-    case detail::type::int128_type:
-    case detail::type::uint128_type:
-      detail::check_int_type_spec(specs_.type, eh);
-      break;
-    case detail::type::char_type:
-      detail::check_char_specs(specs_, eh);
-      break;
-    case detail::type::float_type:
-      if (detail::const_check(FMT_USE_FLOAT))
-        detail::parse_float_type_spec(specs_, eh);
-      else
-        FMT_ASSERT(false, "float support disabled");
-      break;
-    case detail::type::double_type:
-      if (detail::const_check(FMT_USE_DOUBLE))
-        detail::parse_float_type_spec(specs_, eh);
-      else
-        FMT_ASSERT(false, "double support disabled");
-      break;
-    case detail::type::long_double_type:
-      if (detail::const_check(FMT_USE_LONG_DOUBLE))
-        detail::parse_float_type_spec(specs_, eh);
-      else
-        FMT_ASSERT(false, "long double support disabled");
-      break;
-    case detail::type::cstring_type:
-      detail::check_cstring_type_spec(specs_.type, eh);
-      break;
-    case detail::type::string_type:
-      detail::check_string_type_spec(specs_.type, eh);
-      break;
-    case detail::type::pointer_type:
-      detail::check_pointer_type_spec(specs_.type, eh);
-      break;
-    case detail::type::custom_type:
-      // Custom format specifiers are checked in parse functions of
-      // formatter specializations.
-      break;
-    }
-    return it;
-  }
-
-  template <detail::type U = detail::type_constant<T, Char>::value,
-            enable_if_t<(U == detail::type::string_type ||
-                         U == detail::type::cstring_type ||
-                         U == detail::type::char_type),
-                        int> = 0>
-  FMT_CONSTEXPR void set_debug_format() {
-    specs_.type = presentation_type::debug;
-  }
-
-  template <typename FormatContext>
-  FMT_CONSTEXPR auto format(const T& val, FormatContext& ctx) const
-      -> decltype(ctx.out());
-};
-
-#define FMT_FORMAT_AS(Type, Base)                                        \
-  template <typename Char>                                               \
-  struct formatter<Type, Char> : formatter<Base, Char> {                 \
-    template <typename FormatContext>                                    \
-    auto format(Type const& val, FormatContext& ctx) const               \
-        -> decltype(ctx.out()) {                                         \
-      return formatter<Base, Char>::format(static_cast<Base>(val), ctx); \
-    }                                                                    \
-  }
-
-FMT_FORMAT_AS(signed char, int);
-FMT_FORMAT_AS(unsigned char, unsigned);
-FMT_FORMAT_AS(short, int);
-FMT_FORMAT_AS(unsigned short, unsigned);
-FMT_FORMAT_AS(long, long long);
-FMT_FORMAT_AS(unsigned long, unsigned long long);
-FMT_FORMAT_AS(Char*, const Char*);
-FMT_FORMAT_AS(std::basic_string<Char>, basic_string_view<Char>);
-FMT_FORMAT_AS(std::nullptr_t, const void*);
-FMT_FORMAT_AS(detail::std_string_view<Char>, basic_string_view<Char>);
-
-template <typename Char> struct basic_runtime { basic_string_view<Char> str; };
-
-/** A compile-time format string. */
-template <typename Char, typename... Args> class basic_format_string {
- private:
-  basic_string_view<Char> str_;
-
- public:
-  template <typename S,
-            FMT_ENABLE_IF(
-                std::is_convertible<const S&, basic_string_view<Char>>::value)>
-  FMT_CONSTEVAL FMT_INLINE basic_format_string(const S& s) : str_(s) {
-    static_assert(
-        detail::count<
-            (std::is_base_of<detail::view, remove_reference_t<Args>>::value &&
-             std::is_reference<Args>::value)...>() == 0,
-        "passing views as lvalues is disallowed");
-#ifdef FMT_HAS_CONSTEVAL
-    if constexpr (detail::count_named_args<Args...>() ==
-                  detail::count_statically_named_args<Args...>()) {
-      using checker = detail::format_string_checker<Char, detail::error_handler,
-                                                    remove_cvref_t<Args>...>;
-      detail::parse_format_string<true>(str_, checker(s, {}));
-    }
-#else
-    detail::check_format_string<Args...>(s);
-#endif
-  }
-  basic_format_string(basic_runtime<Char> r) : str_(r.str) {}
-
-  FMT_INLINE operator basic_string_view<Char>() const { return str_; }
-  FMT_INLINE basic_string_view<Char> get() const { return str_; }
-};
-
-#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
-// Workaround broken conversion on older gcc.
-template <typename...> using format_string = string_view;
-inline auto runtime(string_view s) -> string_view { return s; }
-#else
-template <typename... Args>
-using format_string = basic_format_string<char, type_identity_t<Args>...>;
-/**
-  \rst
-  Creates a runtime format string.
-
-  **Example**::
-
-    // Check format string at runtime instead of compile-time.
-    fmt::print(fmt::runtime("{:d}"), "I am not a number");
-  \endrst
- */
-inline auto runtime(string_view s) -> basic_runtime<char> { return {{s}}; }
-#endif
-
-FMT_API auto vformat(string_view fmt, format_args args) -> std::string;
-
-/**
-  \rst
-  Formats ``args`` according to specifications in ``fmt`` and returns the result
-  as a string.
-
-  **Example**::
-
-    #include <fmt/core.h>
-    std::string message = fmt::format("The answer is {}.", 42);
-  \endrst
-*/
-template <typename... T>
-FMT_NODISCARD FMT_INLINE auto format(format_string<T...> fmt, T&&... args)
-    -> std::string {
-  return vformat(fmt, fmt::make_format_args(args...));
-}
-
-/** Formats a string and writes the output to ``out``. */
-template <typename OutputIt,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
-auto vformat_to(OutputIt out, string_view fmt, format_args args) -> OutputIt {
-  auto&& buf = detail::get_buffer<char>(out);
-  detail::vformat_to(buf, fmt, args, {});
-  return detail::get_iterator(buf, out);
-}
-
-/**
- \rst
- Formats ``args`` according to specifications in ``fmt``, writes the result to
- the output iterator ``out`` and returns the iterator past the end of the output
- range. `format_to` does not append a terminating null character.
-
- **Example**::
-
-   auto out = std::vector<char>();
-   fmt::format_to(std::back_inserter(out), "{}", 42);
- \endrst
- */
-template <typename OutputIt, typename... T,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
-FMT_INLINE auto format_to(OutputIt out, format_string<T...> fmt, T&&... args)
-    -> OutputIt {
-  return vformat_to(out, fmt, fmt::make_format_args(args...));
-}
-
-template <typename OutputIt> struct format_to_n_result {
-  /** Iterator past the end of the output range. */
-  OutputIt out;
-  /** Total (not truncated) output size. */
-  size_t size;
-};
-
-template <typename OutputIt, typename... T,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
-auto vformat_to_n(OutputIt out, size_t n, string_view fmt, format_args args)
-    -> format_to_n_result<OutputIt> {
-  using traits = detail::fixed_buffer_traits;
-  auto buf = detail::iterator_buffer<OutputIt, char, traits>(out, n);
-  detail::vformat_to(buf, fmt, args, {});
-  return {buf.out(), buf.count()};
-}
-
-/**
-  \rst
-  Formats ``args`` according to specifications in ``fmt``, writes up to ``n``
-  characters of the result to the output iterator ``out`` and returns the total
-  (not truncated) output size and the iterator past the end of the output range.
-  `format_to_n` does not append a terminating null character.
-  \endrst
- */
-template <typename OutputIt, typename... T,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value)>
-FMT_INLINE auto format_to_n(OutputIt out, size_t n, format_string<T...> fmt,
-                            T&&... args) -> format_to_n_result<OutputIt> {
-  return vformat_to_n(out, n, fmt, fmt::make_format_args(args...));
-}
-
-/** Returns the number of chars in the output of ``format(fmt, args...)``. */
-template <typename... T>
-FMT_NODISCARD FMT_INLINE auto formatted_size(format_string<T...> fmt,
-                                             T&&... args) -> size_t {
-  auto buf = detail::counting_buffer<>();
-  detail::vformat_to(buf, string_view(fmt),
-                     format_args(fmt::make_format_args(args...)), {});
-  return buf.count();
-}
-
-FMT_API void vprint(string_view fmt, format_args args);
-FMT_API void vprint(std::FILE* f, string_view fmt, format_args args);
-
-/**
-  \rst
-  Formats ``args`` according to specifications in ``fmt`` and writes the output
-  to ``stdout``.
-
-  **Example**::
-
-    fmt::print("Elapsed time: {0:.2f} seconds", 1.23);
-  \endrst
- */
-template <typename... T>
-FMT_INLINE void print(format_string<T...> fmt, T&&... args) {
-  const auto& vargs = fmt::make_format_args(args...);
-  return detail::is_utf8() ? vprint(fmt, vargs)
-                           : detail::vprint_mojibake(stdout, fmt, vargs);
-}
-
-/**
-  \rst
-  Formats ``args`` according to specifications in ``fmt`` and writes the
-  output to the file ``f``.
-
-  **Example**::
-
-    fmt::print(stderr, "Don't {}!", "panic");
-  \endrst
- */
-template <typename... T>
-FMT_INLINE void print(std::FILE* f, format_string<T...> fmt, T&&... args) {
-  const auto& vargs = fmt::make_format_args(args...);
-  return detail::is_utf8() ? vprint(f, fmt, vargs)
-                           : detail::vprint_mojibake(f, fmt, vargs);
-}
-
-FMT_MODULE_EXPORT_END
-FMT_GCC_PRAGMA("GCC pop_options")
-FMT_END_NAMESPACE
-
-#ifdef FMT_HEADER_ONLY
-#  include "format.h"
-#endif
-#endif  // FMT_CORE_H_
+#include "format.h"
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/format-inl.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/format-inl.h
index 9ac55e47f22f..a887483b6f46 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/format-inl.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/format-inl.h
@@ -8,21 +8,19 @@
 #ifndef FMT_FORMAT_INL_H_
 #define FMT_FORMAT_INL_H_
 
-#include <algorithm>
-#include <cctype>
-#include <cerrno>  // errno
-#include <climits>
-#include <cmath>
-#include <cstdarg>
-#include <cstring>  // std::memmove
-#include <cwchar>
-#include <exception>
-
-#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
-#  include <locale>
+#ifndef FMT_MODULE
+#  include <algorithm>
+#  include <cerrno>  // errno
+#  include <climits>
+#  include <cmath>
+#  include <exception>
+
+#  if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
+#    include <locale>
+#  endif
 #endif
 
-#ifdef _WIN32
+#if defined(_WIN32) && !defined(FMT_USE_WRITE_CONSOLE)
 #  include <io.h>  // _isatty
 #endif
 
@@ -40,10 +38,6 @@ FMT_FUNC void assert_fail(const char* file, int line, const char* message) {
   std::terminate();
 }
 
-FMT_FUNC void throw_format_error(const char* message) {
-  FMT_THROW(format_error(message));
-}
-
 FMT_FUNC void format_error_code(detail::buffer<char>& out, int error_code,
                                 string_view message) noexcept {
   // Report error code making sure that the output fits into
@@ -60,10 +54,10 @@ FMT_FUNC void format_error_code(detail::buffer<char>& out, int error_code,
     ++error_code_size;
   }
   error_code_size += detail::to_unsigned(detail::count_digits(abs_value));
-  auto it = buffer_appender<char>(out);
+  auto it = appender(out);
   if (message.size() <= inline_buffer_size - error_code_size)
-    format_to(it, FMT_STRING("{}{}"), message, SEP);
-  format_to(it, FMT_STRING("{}{}"), ERROR_STR, error_code);
+    fmt::format_to(it, FMT_STRING("{}{}"), message, SEP);
+  fmt::format_to(it, FMT_STRING("{}{}"), ERROR_STR, error_code);
   FMT_ASSERT(out.size() <= inline_buffer_size, "");
 }
 
@@ -77,13 +71,10 @@ FMT_FUNC void report_error(format_func func, int error_code,
 }
 
 // A wrapper around fwrite that throws on error.
-inline void fwrite_fully(const void* ptr, size_t size, size_t count,
-                         FILE* stream) {
-  size_t written = std::fwrite(ptr, size, count, stream);
-#if !__NVCC__
+inline void fwrite_fully(const void* ptr, size_t count, FILE* stream) {
+  size_t written = std::fwrite(ptr, 1, count, stream);
   if (written < count)
     FMT_THROW(system_error(errno, FMT_STRING("cannot write to file")));
-#endif
 }
 
 #ifndef FMT_STATIC_THOUSANDS_SEPARATOR
@@ -92,7 +83,7 @@ locale_ref::locale_ref(const Locale& loc) : locale_(&loc) {
   static_assert(std::is_same<Locale, std::locale>::value, "");
 }
 
-template <typename Locale> Locale locale_ref::get() const {
+template <typename Locale> auto locale_ref::get() const -> Locale {
   static_assert(std::is_same<Locale, std::locale>::value, "");
   return locale_ ? *static_cast<const std::locale*>(locale_) : std::locale();
 }
@@ -104,7 +95,8 @@ FMT_FUNC auto thousands_sep_impl(locale_ref loc) -> thousands_sep_result<Char> {
   auto thousands_sep = grouping.empty() ? Char() : facet.thousands_sep();
   return {std::move(grouping), thousands_sep};
 }
-template <typename Char> FMT_FUNC Char decimal_point_impl(locale_ref loc) {
+template <typename Char>
+FMT_FUNC auto decimal_point_impl(locale_ref loc) -> Char {
   return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>())
       .decimal_point();
 }
@@ -120,7 +112,11 @@ template <typename Char> FMT_FUNC Char decimal_point_impl(locale_ref) {
 
 FMT_FUNC auto write_loc(appender out, loc_value value,
                         const format_specs& specs, locale_ref loc) -> bool {
-#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
+#ifdef FMT_STATIC_THOUSANDS_SEPARATOR
+  value.visit(loc_writer<>{
+      out, specs, std::string(1, FMT_STATIC_THOUSANDS_SEPARATOR), "\3", "."});
+  return true;
+#else
   auto locale = loc.get<std::locale>();
   // We cannot use the num_put<char> facet because it may produce output in
   // a wrong encoding.
@@ -129,10 +125,13 @@ FMT_FUNC auto write_loc(appender out, loc_value value,
     return std::use_facet<facet>(locale).put(out, value, specs);
   return facet(locale).put(out, value, specs);
 #endif
-  return false;
 }
 }  // namespace detail
 
+FMT_FUNC void report_error(const char* message) {
+  FMT_THROW(format_error(message));
+}
+
 template <typename Locale> typename Locale::id format_facet<Locale>::id;
 
 #ifndef FMT_STATIC_THOUSANDS_SEPARATOR
@@ -150,96 +149,41 @@ FMT_API FMT_FUNC auto format_facet<std::locale>::do_put(
 }
 #endif
 
-#if !FMT_MSC_VERSION
-FMT_API FMT_FUNC format_error::~format_error() noexcept = default;
-#endif
-
-#if !__NVCC__
-FMT_FUNC std::system_error vsystem_error(int error_code, string_view format_str,
-                                         format_args args) {
+FMT_FUNC auto vsystem_error(int error_code, string_view fmt, format_args args)
+    -> std::system_error {
   auto ec = std::error_code(error_code, std::generic_category());
-  return std::system_error(ec, vformat(format_str, args));
+  return std::system_error(ec, vformat(fmt, args));
 }
-#endif
 
 namespace detail {
 
-template <typename F> inline bool operator==(basic_fp<F> x, basic_fp<F> y) {
+template <typename F>
+inline auto operator==(basic_fp<F> x, basic_fp<F> y) -> bool {
   return x.f == y.f && x.e == y.e;
 }
 
 // Compilers should be able to optimize this into the ror instruction.
-FMT_CONSTEXPR inline uint32_t rotr(uint32_t n, uint32_t r) noexcept {
+FMT_CONSTEXPR inline auto rotr(uint32_t n, uint32_t r) noexcept -> uint32_t {
   r &= 31;
   return (n >> r) | (n << (32 - r));
 }
-FMT_CONSTEXPR inline uint64_t rotr(uint64_t n, uint32_t r) noexcept {
+FMT_CONSTEXPR inline auto rotr(uint64_t n, uint32_t r) noexcept -> uint64_t {
   r &= 63;
   return (n >> r) | (n << (64 - r));
 }
 
-// Computes 128-bit result of multiplication of two 64-bit unsigned integers.
-inline uint128_fallback umul128(uint64_t x, uint64_t y) noexcept {
-#if FMT_USE_INT128
-  auto p = static_cast<uint128_opt>(x) * static_cast<uint128_opt>(y);
-  return {static_cast<uint64_t>(p >> 64), static_cast<uint64_t>(p)};
-#elif defined(_MSC_VER) && defined(_M_X64)
-  auto result = uint128_fallback();
-  result.lo_ = _umul128(x, y, &result.hi_);
-  return result;
-#else
-  const uint64_t mask = static_cast<uint64_t>(max_value<uint32_t>());
-
-  uint64_t a = x >> 32;
-  uint64_t b = x & mask;
-  uint64_t c = y >> 32;
-  uint64_t d = y & mask;
-
-  uint64_t ac = a * c;
-  uint64_t bc = b * c;
-  uint64_t ad = a * d;
-  uint64_t bd = b * d;
-
-  uint64_t intermediate = (bd >> 32) + (ad & mask) + (bc & mask);
-
-  return {ac + (intermediate >> 32) + (ad >> 32) + (bc >> 32),
-          (intermediate << 32) + (bd & mask)};
-#endif
-}
-
 // Implementation of Dragonbox algorithm: https://github.com/jk-jeon/dragonbox.
 namespace dragonbox {
-// Computes upper 64 bits of multiplication of two 64-bit unsigned integers.
-inline uint64_t umul128_upper64(uint64_t x, uint64_t y) noexcept {
-#if FMT_USE_INT128
-  auto p = static_cast<uint128_opt>(x) * static_cast<uint128_opt>(y);
-  return static_cast<uint64_t>(p >> 64);
-#elif defined(_MSC_VER) && defined(_M_X64)
-  return __umulh(x, y);
-#else
-  return umul128(x, y).high();
-#endif
-}
-
-// Computes upper 128 bits of multiplication of a 64-bit unsigned integer and a
-// 128-bit unsigned integer.
-inline uint128_fallback umul192_upper128(uint64_t x,
-                                         uint128_fallback y) noexcept {
-  uint128_fallback r = umul128(x, y.high());
-  r += umul128_upper64(x, y.low());
-  return r;
-}
-
 // Computes upper 64 bits of multiplication of a 32-bit unsigned integer and a
 // 64-bit unsigned integer.
-inline uint64_t umul96_upper64(uint32_t x, uint64_t y) noexcept {
+inline auto umul96_upper64(uint32_t x, uint64_t y) noexcept -> uint64_t {
   return umul128_upper64(static_cast<uint64_t>(x) << 32, y);
 }
 
 // Computes lower 128 bits of multiplication of a 64-bit unsigned integer and a
 // 128-bit unsigned integer.
-inline uint128_fallback umul192_lower128(uint64_t x,
-                                         uint128_fallback y) noexcept {
+inline auto umul192_lower128(uint64_t x, uint128_fallback y) noexcept
+    -> uint128_fallback {
   uint64_t high = x * y.high();
   uint128_fallback high_low = umul128(x, y.low());
   return {high + high_low.high(), high_low.low()};
@@ -247,29 +191,17 @@ inline uint128_fallback umul192_lower128(uint64_t x,
 
 // Computes lower 64 bits of multiplication of a 32-bit unsigned integer and a
 // 64-bit unsigned integer.
-inline uint64_t umul96_lower64(uint32_t x, uint64_t y) noexcept {
+inline auto umul96_lower64(uint32_t x, uint64_t y) noexcept -> uint64_t {
   return x * y;
 }
 
-// Computes floor(log10(pow(2, e))) for e in [-2620, 2620] using the method from
-// https://fmt.dev/papers/Dragonbox.pdf#page=28, section 6.1.
-inline int floor_log10_pow2(int e) noexcept {
-  FMT_ASSERT(e <= 2620 && e >= -2620, "too large exponent");
-  static_assert((-1 >> 1) == -1, "right shift is not arithmetic");
-  return (e * 315653) >> 20;
-}
-
 // Various fast log computations.
-inline int floor_log2_pow10(int e) noexcept {
-  FMT_ASSERT(e <= 1233 && e >= -1233, "too large exponent");
-  return (e * 1741647) >> 19;
-}
-inline int floor_log10_pow2_minus_log10_4_over_3(int e) noexcept {
+inline auto floor_log10_pow2_minus_log10_4_over_3(int e) noexcept -> int {
   FMT_ASSERT(e <= 2936 && e >= -2985, "too large exponent");
   return (e * 631305 - 261663) >> 21;
 }
 
-static constexpr struct {
+FMT_INLINE_VARIABLE constexpr struct {
   uint32_t divisor;
   int shift_amount;
 } div_small_pow10_infos[] = {{10, 16}, {100, 16}};
@@ -278,7 +210,7 @@ static constexpr struct {
 // divisible by pow(10, N).
 // Precondition: n <= pow(10, N + 1).
 template <int N>
-bool check_divisibility_and_divide_by_pow10(uint32_t& n) noexcept {
+auto check_divisibility_and_divide_by_pow10(uint32_t& n) noexcept -> bool {
   // The numbers below are chosen such that:
   //   1. floor(n/d) = floor(nm / 2^k) where d=10 or d=100,
   //   2. nm mod 2^k < m if and only if n is divisible by d,
@@ -303,7 +235,7 @@ bool check_divisibility_and_divide_by_pow10(uint32_t& n) noexcept {
 
 // Computes floor(n / pow(10, N)) for small n and N.
 // Precondition: n <= pow(10, N + 1).
-template <int N> uint32_t small_division_by_pow10(uint32_t n) noexcept {
+template <int N> auto small_division_by_pow10(uint32_t n) noexcept -> uint32_t {
   constexpr auto info = div_small_pow10_infos[N - 1];
   FMT_ASSERT(n <= info.divisor * 10, "n is too large");
   constexpr uint32_t magic_number =
@@ -312,24 +244,24 @@ template <int N> uint32_t small_division_by_pow10(uint32_t n) noexcept {
 }
 
 // Computes floor(n / 10^(kappa + 1)) (float)
-inline uint32_t divide_by_10_to_kappa_plus_1(uint32_t n) noexcept {
+inline auto divide_by_10_to_kappa_plus_1(uint32_t n) noexcept -> uint32_t {
   // 1374389535 = ceil(2^37/100)
   return static_cast<uint32_t>((static_cast<uint64_t>(n) * 1374389535) >> 37);
 }
 // Computes floor(n / 10^(kappa + 1)) (double)
-inline uint64_t divide_by_10_to_kappa_plus_1(uint64_t n) noexcept {
+inline auto divide_by_10_to_kappa_plus_1(uint64_t n) noexcept -> uint64_t {
   // 2361183241434822607 = ceil(2^(64+7)/1000)
   return umul128_upper64(n, 2361183241434822607ull) >> 7;
 }
 
 // Various subroutines using pow10 cache
-template <class T> struct cache_accessor;
+template <typename T> struct cache_accessor;
 
 template <> struct cache_accessor<float> {
   using carrier_uint = float_info<float>::carrier_uint;
   using cache_entry_type = uint64_t;
 
-  static uint64_t get_cached_power(int k) noexcept {
+  static auto get_cached_power(int k) noexcept -> uint64_t {
     FMT_ASSERT(k >= float_info<float>::min_k && k <= float_info<float>::max_k,
                "k is out of range");
     static constexpr const uint64_t pow10_significands[] = {
@@ -371,20 +303,23 @@ template <> struct cache_accessor<float> {
     bool is_integer;
   };
 
-  static compute_mul_result compute_mul(
-      carrier_uint u, const cache_entry_type& cache) noexcept {
+  static auto compute_mul(carrier_uint u,
+                          const cache_entry_type& cache) noexcept
+      -> compute_mul_result {
     auto r = umul96_upper64(u, cache);
     return {static_cast<carrier_uint>(r >> 32),
             static_cast<carrier_uint>(r) == 0};
   }
 
-  static uint32_t compute_delta(const cache_entry_type& cache,
-                                int beta) noexcept {
+  static auto compute_delta(const cache_entry_type& cache, int beta) noexcept
+      -> uint32_t {
     return static_cast<uint32_t>(cache >> (64 - 1 - beta));
   }
 
-  static compute_mul_parity_result compute_mul_parity(
-      carrier_uint two_f, const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_mul_parity(carrier_uint two_f,
+                                 const cache_entry_type& cache,
+                                 int beta) noexcept
+      -> compute_mul_parity_result {
     FMT_ASSERT(beta >= 1, "");
     FMT_ASSERT(beta < 64, "");
 
@@ -393,22 +328,22 @@ template <> struct cache_accessor<float> {
             static_cast<uint32_t>(r >> (32 - beta)) == 0};
   }
 
-  static carrier_uint compute_left_endpoint_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_left_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
     return static_cast<carrier_uint>(
         (cache - (cache >> (num_significand_bits<float>() + 2))) >>
         (64 - num_significand_bits<float>() - 1 - beta));
   }
 
-  static carrier_uint compute_right_endpoint_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_right_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
     return static_cast<carrier_uint>(
         (cache + (cache >> (num_significand_bits<float>() + 1))) >>
         (64 - num_significand_bits<float>() - 1 - beta));
   }
 
-  static carrier_uint compute_round_up_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_round_up_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
     return (static_cast<carrier_uint>(
                 cache >> (64 - num_significand_bits<float>() - 2 - beta)) +
             1) /
@@ -420,7 +355,7 @@ template <> struct cache_accessor<double> {
   using carrier_uint = float_info<double>::carrier_uint;
   using cache_entry_type = uint128_fallback;
 
-  static uint128_fallback get_cached_power(int k) noexcept {
+  static auto get_cached_power(int k) noexcept -> uint128_fallback {
     FMT_ASSERT(k >= float_info<double>::min_k && k <= float_info<double>::max_k,
                "k is out of range");
 
@@ -1044,8 +979,22 @@ template <> struct cache_accessor<double> {
       {0xfcf62c1dee382c42, 0x46729e03dd9ed7b6},
       {0x9e19db92b4e31ba9, 0x6c07a2c26a8346d2},
       {0xc5a05277621be293, 0xc7098b7305241886},
-      { 0xf70867153aa2db38,
-        0xb8cbee4fc66d1ea8 }
+      {0xf70867153aa2db38, 0xb8cbee4fc66d1ea8},
+      {0x9a65406d44a5c903, 0x737f74f1dc043329},
+      {0xc0fe908895cf3b44, 0x505f522e53053ff3},
+      {0xf13e34aabb430a15, 0x647726b9e7c68ff0},
+      {0x96c6e0eab509e64d, 0x5eca783430dc19f6},
+      {0xbc789925624c5fe0, 0xb67d16413d132073},
+      {0xeb96bf6ebadf77d8, 0xe41c5bd18c57e890},
+      {0x933e37a534cbaae7, 0x8e91b962f7b6f15a},
+      {0xb80dc58e81fe95a1, 0x723627bbb5a4adb1},
+      {0xe61136f2227e3b09, 0xcec3b1aaa30dd91d},
+      {0x8fcac257558ee4e6, 0x213a4f0aa5e8a7b2},
+      {0xb3bd72ed2af29e1f, 0xa988e2cd4f62d19e},
+      {0xe0accfa875af45a7, 0x93eb1b80a33b8606},
+      {0x8c6c01c9498d8b88, 0xbc72f130660533c4},
+      {0xaf87023b9bf0ee6a, 0xeb8fad7c7f8680b5},
+      {0xdb68c2ca82ed2a05, 0xa67398db9f6820e2},
 #else
       {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
       {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
@@ -1069,8 +1018,8 @@ template <> struct cache_accessor<double> {
       {0x8da471a9de737e24, 0x5ceaecfed289e5d3},
       {0xe4d5e82392a40515, 0x0fabaf3feaa5334b},
       {0xb8da1662e7b00a17, 0x3d6a751f3b936244},
-      { 0x95527a5202df0ccb,
-        0x0f37801e0c43ebc9 }
+      {0x95527a5202df0ccb, 0x0f37801e0c43ebc9},
+      {0xf13e34aabb430a15, 0x647726b9e7c68ff0}
 #endif
     };
 
@@ -1130,19 +1079,22 @@ template <> struct cache_accessor<double> {
     bool is_integer;
   };
 
-  static compute_mul_result compute_mul(
-      carrier_uint u, const cache_entry_type& cache) noexcept {
+  static auto compute_mul(carrier_uint u,
+                          const cache_entry_type& cache) noexcept
+      -> compute_mul_result {
     auto r = umul192_upper128(u, cache);
     return {r.high(), r.low() == 0};
   }
 
-  static uint32_t compute_delta(cache_entry_type const& cache,
-                                int beta) noexcept {
+  static auto compute_delta(cache_entry_type const& cache, int beta) noexcept
+      -> uint32_t {
     return static_cast<uint32_t>(cache.high() >> (64 - 1 - beta));
   }
 
-  static compute_mul_parity_result compute_mul_parity(
-      carrier_uint two_f, const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_mul_parity(carrier_uint two_f,
+                                 const cache_entry_type& cache,
+                                 int beta) noexcept
+      -> compute_mul_parity_result {
     FMT_ASSERT(beta >= 1, "");
     FMT_ASSERT(beta < 64, "");
 
@@ -1151,31 +1103,35 @@ template <> struct cache_accessor<double> {
             ((r.high() << beta) | (r.low() >> (64 - beta))) == 0};
   }
 
-  static carrier_uint compute_left_endpoint_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_left_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
     return (cache.high() -
             (cache.high() >> (num_significand_bits<double>() + 2))) >>
            (64 - num_significand_bits<double>() - 1 - beta);
   }
 
-  static carrier_uint compute_right_endpoint_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_right_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
     return (cache.high() +
             (cache.high() >> (num_significand_bits<double>() + 1))) >>
            (64 - num_significand_bits<double>() - 1 - beta);
   }
 
-  static carrier_uint compute_round_up_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta) noexcept {
+  static auto compute_round_up_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta) noexcept -> carrier_uint {
     return ((cache.high() >> (64 - num_significand_bits<double>() - 2 - beta)) +
             1) /
            2;
   }
 };
 
+FMT_FUNC auto get_cached_power(int k) noexcept -> uint128_fallback {
+  return cache_accessor<double>::get_cached_power(k);
+}
+
 // Various integer checks
-template <class T>
-bool is_left_endpoint_integer_shorter_interval(int exponent) noexcept {
+template <typename T>
+auto is_left_endpoint_integer_shorter_interval(int exponent) noexcept -> bool {
   const int case_shorter_interval_left_endpoint_lower_threshold = 2;
   const int case_shorter_interval_left_endpoint_upper_threshold = 3;
   return exponent >= case_shorter_interval_left_endpoint_lower_threshold &&
@@ -1183,12 +1139,12 @@ bool is_left_endpoint_integer_shorter_interval(int exponent) noexcept {
 }
 
 // Remove trailing zeros from n and return the number of zeros removed (float)
-FMT_INLINE int remove_trailing_zeros(uint32_t& n) noexcept {
+FMT_INLINE int remove_trailing_zeros(uint32_t& n, int s = 0) noexcept {
   FMT_ASSERT(n != 0, "");
-  const uint32_t mod_inv_5 = 0xcccccccd;
-  const uint32_t mod_inv_25 = mod_inv_5 * mod_inv_5;
+  // Modular inverse of 5 (mod 2^32): (mod_inv_5 * 5) mod 2^32 = 1.
+  constexpr uint32_t mod_inv_5 = 0xcccccccd;
+  constexpr uint32_t mod_inv_25 = 0xc28f5c29;  // = mod_inv_5 * mod_inv_5
 
-  int s = 0;
   while (true) {
     auto q = rotr(n * mod_inv_25, 2);
     if (q > max_value<uint32_t>() / 100) break;
@@ -1200,7 +1156,6 @@ FMT_INLINE int remove_trailing_zeros(uint32_t& n) noexcept {
     n = q;
     s |= 1;
   }
-
   return s;
 }
 
@@ -1214,32 +1169,17 @@ FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept {
 
   // Is n is divisible by 10^8?
   if ((nm.high() & ((1ull << (90 - 64)) - 1)) == 0 && nm.low() < magic_number) {
-    // If yes, work with the quotient.
+    // If yes, work with the quotient...
     auto n32 = static_cast<uint32_t>(nm.high() >> (90 - 64));
-
-    const uint32_t mod_inv_5 = 0xcccccccd;
-    const uint32_t mod_inv_25 = mod_inv_5 * mod_inv_5;
-
-    int s = 8;
-    while (true) {
-      auto q = rotr(n32 * mod_inv_25, 2);
-      if (q > max_value<uint32_t>() / 100) break;
-      n32 = q;
-      s += 2;
-    }
-    auto q = rotr(n32 * mod_inv_5, 1);
-    if (q <= max_value<uint32_t>() / 10) {
-      n32 = q;
-      s |= 1;
-    }
-
+    // ... and use the 32 bit variant of the function
+    int s = remove_trailing_zeros(n32, 8);
     n = n32;
     return s;
   }
 
   // If n is not divisible by 10^8, work with n itself.
-  const uint64_t mod_inv_5 = 0xcccccccccccccccd;
-  const uint64_t mod_inv_25 = mod_inv_5 * mod_inv_5;
+  constexpr uint64_t mod_inv_5 = 0xcccccccccccccccd;
+  constexpr uint64_t mod_inv_25 = 0x8f5c28f5c28f5c29;  // mod_inv_5 * mod_inv_5
 
   int s = 0;
   while (true) {
@@ -1258,7 +1198,7 @@ FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept {
 }
 
 // The main algorithm for shorter interval case
-template <class T>
+template <typename T>
 FMT_INLINE decimal_fp<T> shorter_interval_case(int exponent) noexcept {
   decimal_fp<T> ret_value;
   // Compute k and beta
@@ -1305,7 +1245,7 @@ FMT_INLINE decimal_fp<T> shorter_interval_case(int exponent) noexcept {
   return ret_value;
 }
 
-template <typename T> decimal_fp<T> to_decimal(T x) noexcept {
+template <typename T> auto to_decimal(T x) noexcept -> decimal_fp<T> {
   // Step 1: integer promotion & Schubfach multiplier calculation.
 
   using carrier_uint = typename float_info<T>::carrier_uint;
@@ -1429,17 +1369,6 @@ template <typename T> decimal_fp<T> to_decimal(T x) noexcept {
   return ret_value;
 }
 }  // namespace dragonbox
-
-#ifdef _MSC_VER
-FMT_FUNC auto fmt_snprintf(char* buf, size_t size, const char* fmt, ...)
-    -> int {
-  auto args = va_list();
-  va_start(args, fmt);
-  int result = vsnprintf_s(buf, size, _TRUNCATE, fmt, args);
-  va_end(args);
-  return result;
-}
-#endif
 }  // namespace detail
 
 template <> struct formatter<detail::bigint> {
@@ -1455,15 +1384,15 @@ template <> struct formatter<detail::bigint> {
     for (auto i = n.bigits_.size(); i > 0; --i) {
       auto value = n.bigits_[i - 1u];
       if (first) {
-        out = format_to(out, FMT_STRING("{:x}"), value);
+        out = fmt::format_to(out, FMT_STRING("{:x}"), value);
         first = false;
         continue;
       }
-      out = format_to(out, FMT_STRING("{:08x}"), value);
+      out = fmt::format_to(out, FMT_STRING("{:08x}"), value);
     }
     if (n.exp_ > 0)
-      out = format_to(out, FMT_STRING("p{}"),
-                      n.exp_ * detail::bigint::bigit_bits);
+      out = fmt::format_to(out, FMT_STRING("p{}"),
+                           n.exp_ * detail::bigint::bigit_bits);
     return out;
   }
 };
@@ -1485,14 +1414,12 @@ FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
 
 FMT_FUNC void format_system_error(detail::buffer<char>& out, int error_code,
                                   const char* message) noexcept {
-#if !__NVCC__
   FMT_TRY {
     auto ec = std::error_code(error_code, std::generic_category());
-    write(std::back_inserter(out), std::system_error(ec, message).what());
+    detail::write(appender(out), std::system_error(ec, message).what());
     return;
   }
   FMT_CATCH(...) {}
-#endif
   format_error_code(out, error_code, message);
 }
 
@@ -1501,7 +1428,7 @@ FMT_FUNC void report_system_error(int error_code,
   report_error(format_system_error, error_code, message);
 }
 
-FMT_FUNC std::string vformat(string_view fmt, format_args args) {
+FMT_FUNC auto vformat(string_view fmt, format_args args) -> std::string {
   // Don't optimize the "{}" case to keep the binary size small and because it
   // can be better optimized in fmt::format anyway.
   auto buffer = memory_buffer();
@@ -1510,57 +1437,299 @@ FMT_FUNC std::string vformat(string_view fmt, format_args args) {
 }
 
 namespace detail {
-#ifdef _WIN32
+
+template <typename T> struct span {
+  T* data;
+  size_t size;
+};
+
+template <typename F> auto flockfile(F* f) -> decltype(_lock_file(f)) {
+  _lock_file(f);
+}
+template <typename F> auto funlockfile(F* f) -> decltype(_unlock_file(f)) {
+  _unlock_file(f);
+}
+
+#ifndef getc_unlocked
+template <typename F> auto getc_unlocked(F* f) -> decltype(_fgetc_nolock(f)) {
+  return _fgetc_nolock(f);
+}
+#endif
+
+template <typename F = FILE, typename Enable = void>
+struct has_flockfile : std::false_type {};
+
+template <typename F>
+struct has_flockfile<F, void_t<decltype(flockfile(&std::declval<F&>()))>>
+    : std::true_type {};
+
+// A FILE wrapper. F is FILE defined as a template parameter to make system API
+// detection work.
+template <typename F> class file_base {
+ public:
+  F* file_;
+
+ public:
+  file_base(F* file) : file_(file) {}
+  operator F*() const { return file_; }
+
+  // Reads a code unit from the stream.
+  auto get() -> int {
+    int result = getc_unlocked(file_);
+    if (result == EOF && ferror(file_) != 0)
+      FMT_THROW(system_error(errno, FMT_STRING("getc failed")));
+    return result;
+  }
+
+  // Puts the code unit back into the stream buffer.
+  void unget(char c) {
+    if (ungetc(c, file_) == EOF)
+      FMT_THROW(system_error(errno, FMT_STRING("ungetc failed")));
+  }
+
+  void flush() { fflush(this->file_); }
+};
+
+// A FILE wrapper for glibc.
+template <typename F> class glibc_file : public file_base<F> {
+ private:
+  enum {
+    line_buffered = 0x200,  // _IO_LINE_BUF
+    unbuffered = 2          // _IO_UNBUFFERED
+  };
+
+ public:
+  using file_base<F>::file_base;
+
+  auto is_buffered() const -> bool {
+    return (this->file_->_flags & unbuffered) == 0;
+  }
+
+  void init_buffer() {
+    if (this->file_->_IO_write_ptr) return;
+    // Force buffer initialization by placing and removing a char in a buffer.
+    putc_unlocked(0, this->file_);
+    --this->file_->_IO_write_ptr;
+  }
+
+  // Returns the file's read buffer.
+  auto get_read_buffer() const -> span<const char> {
+    auto ptr = this->file_->_IO_read_ptr;
+    return {ptr, to_unsigned(this->file_->_IO_read_end - ptr)};
+  }
+
+  // Returns the file's write buffer.
+  auto get_write_buffer() const -> span<char> {
+    auto ptr = this->file_->_IO_write_ptr;
+    return {ptr, to_unsigned(this->file_->_IO_buf_end - ptr)};
+  }
+
+  void advance_write_buffer(size_t size) { this->file_->_IO_write_ptr += size; }
+
+  bool needs_flush() const {
+    if ((this->file_->_flags & line_buffered) == 0) return false;
+    char* end = this->file_->_IO_write_end;
+    return memchr(end, '\n', to_unsigned(this->file_->_IO_write_ptr - end));
+  }
+
+  void flush() { fflush_unlocked(this->file_); }
+};
+
+// A FILE wrapper for Apple's libc.
+template <typename F> class apple_file : public file_base<F> {
+ private:
+  enum {
+    line_buffered = 1,  // __SLBF
+    unbuffered = 2      // __SNBF
+  };
+
+ public:
+  using file_base<F>::file_base;
+
+  auto is_buffered() const -> bool {
+    return (this->file_->_flags & unbuffered) == 0;
+  }
+
+  void init_buffer() {
+    if (this->file_->_p) return;
+    // Force buffer initialization by placing and removing a char in a buffer.
+    putc_unlocked(0, this->file_);
+    --this->file_->_p;
+    ++this->file_->_w;
+  }
+
+  auto get_read_buffer() const -> span<const char> {
+    return {reinterpret_cast<char*>(this->file_->_p),
+            to_unsigned(this->file_->_r)};
+  }
+
+  auto get_write_buffer() const -> span<char> {
+    return {reinterpret_cast<char*>(this->file_->_p),
+            to_unsigned(this->file_->_bf._base + this->file_->_bf._size -
+                        this->file_->_p)};
+  }
+
+  void advance_write_buffer(size_t size) {
+    this->file_->_p += size;
+    this->file_->_w -= size;
+  }
+
+  bool needs_flush() const {
+    if ((this->file_->_flags & line_buffered) == 0) return false;
+    return memchr(this->file_->_p + this->file_->_w, '\n',
+                  to_unsigned(-this->file_->_w));
+  }
+};
+
+// A fallback FILE wrapper.
+template <typename F> class fallback_file : public file_base<F> {
+ private:
+  char next_;  // The next unconsumed character in the buffer.
+  bool has_next_ = false;
+
+ public:
+  using file_base<F>::file_base;
+
+  auto is_buffered() const -> bool { return false; }
+  auto needs_flush() const -> bool { return false; }
+  void init_buffer() {}
+
+  auto get_read_buffer() const -> span<const char> {
+    return {&next_, has_next_ ? 1u : 0u};
+  }
+
+  auto get_write_buffer() const -> span<char> { return {nullptr, 0}; }
+
+  void advance_write_buffer(size_t) {}
+
+  auto get() -> int {
+    has_next_ = false;
+    return file_base<F>::get();
+  }
+
+  void unget(char c) {
+    file_base<F>::unget(c);
+    next_ = c;
+    has_next_ = true;
+  }
+};
+
+#ifndef FMT_USE_FALLBACK_FILE
+#  define FMT_USE_FALLBACK_FILE 1
+#endif
+
+template <typename F,
+          FMT_ENABLE_IF(sizeof(F::_p) != 0 && !FMT_USE_FALLBACK_FILE)>
+auto get_file(F* f, int) -> apple_file<F> {
+  return f;
+}
+template <typename F,
+          FMT_ENABLE_IF(sizeof(F::_IO_read_ptr) != 0 && !FMT_USE_FALLBACK_FILE)>
+inline auto get_file(F* f, int) -> glibc_file<F> {
+  return f;
+}
+
+inline auto get_file(FILE* f, ...) -> fallback_file<FILE> { return f; }
+
+using file_ref = decltype(get_file(static_cast<FILE*>(nullptr), 0));
+
+template <typename F = FILE, typename Enable = void>
+class file_print_buffer : public buffer<char> {
+ public:
+  explicit file_print_buffer(F*) : buffer(nullptr, size_t()) {}
+};
+
+template <typename F>
+class file_print_buffer<F, enable_if_t<has_flockfile<F>::value>>
+    : public buffer<char> {
+ private:
+  file_ref file_;
+
+  static void grow(buffer<char>& base, size_t) {
+    auto& self = static_cast<file_print_buffer&>(base);
+    self.file_.advance_write_buffer(self.size());
+    if (self.file_.get_write_buffer().size == 0) self.file_.flush();
+    auto buf = self.file_.get_write_buffer();
+    FMT_ASSERT(buf.size > 0, "");
+    self.set(buf.data, buf.size);
+    self.clear();
+  }
+
+ public:
+  explicit file_print_buffer(F* f) : buffer(grow, size_t()), file_(f) {
+    flockfile(f);
+    file_.init_buffer();
+    auto buf = file_.get_write_buffer();
+    set(buf.data, buf.size);
+  }
+  ~file_print_buffer() {
+    file_.advance_write_buffer(size());
+    bool flush = file_.needs_flush();
+    F* f = file_;    // Make funlockfile depend on the template parameter F
+    funlockfile(f);  // for the system API detection to work.
+    if (flush) fflush(file_);
+  }
+};
+
+#if !defined(_WIN32) || defined(FMT_USE_WRITE_CONSOLE)
+FMT_FUNC auto write_console(int, string_view) -> bool { return false; }
+#else
 using dword = conditional_t<sizeof(long) == 4, unsigned long, unsigned>;
 extern "C" __declspec(dllimport) int __stdcall WriteConsoleW(  //
     void*, const void*, dword, dword*, void*);
 
-FMT_FUNC bool write_console(std::FILE* f, string_view text) {
-  auto fd = _fileno(f);
-  if (_isatty(fd)) {
-    detail::utf8_to_utf16 u16(string_view(text.data(), text.size()));
-    auto written = detail::dword();
-    if (detail::WriteConsoleW(reinterpret_cast<void*>(_get_osfhandle(fd)),
-                              u16.c_str(), static_cast<uint32_t>(u16.size()),
-                              &written, nullptr)) {
-      return true;
-    }
-  }
-  // We return false if the file descriptor was not TTY, or it was but
-  // SetConsoleW failed which can happen if the output has been redirected to
-  // NUL. In both cases when we return false, we should attempt to do regular
-  // write via fwrite or std::ostream::write.
-  return false;
+FMT_FUNC bool write_console(int fd, string_view text) {
+  auto u16 = utf8_to_utf16(text);
+  return WriteConsoleW(reinterpret_cast<void*>(_get_osfhandle(fd)), u16.c_str(),
+                       static_cast<dword>(u16.size()), nullptr, nullptr) != 0;
 }
 #endif
 
-FMT_FUNC void print(std::FILE* f, string_view text) {
 #ifdef _WIN32
-  if (write_console(f, text)) return;
+// Print assuming legacy (non-Unicode) encoding.
+FMT_FUNC void vprint_mojibake(std::FILE* f, string_view fmt, format_args args,
+                              bool newline) {
+  auto buffer = memory_buffer();
+  detail::vformat_to(buffer, fmt, args);
+  if (newline) buffer.push_back('\n');
+  fwrite_fully(buffer.data(), buffer.size(), f);
+}
 #endif
-  detail::fwrite_fully(text.data(), 1, text.size(), f);
+
+FMT_FUNC void print(std::FILE* f, string_view text) {
+#if defined(_WIN32) && !defined(FMT_USE_WRITE_CONSOLE)
+  int fd = _fileno(f);
+  if (_isatty(fd)) {
+    std::fflush(f);
+    if (write_console(fd, text)) return;
+  }
+#endif
+  fwrite_fully(text.data(), text.size(), f);
 }
 }  // namespace detail
 
-FMT_FUNC void vprint(std::FILE* f, string_view format_str, format_args args) {
-  memory_buffer buffer;
-  detail::vformat_to(buffer, format_str, args);
+FMT_FUNC void vprint_buffered(std::FILE* f, string_view fmt, format_args args) {
+  auto buffer = memory_buffer();
+  detail::vformat_to(buffer, fmt, args);
   detail::print(f, {buffer.data(), buffer.size()});
 }
 
-#ifdef _WIN32
-// Print assuming legacy (non-Unicode) encoding.
-FMT_FUNC void detail::vprint_mojibake(std::FILE* f, string_view format_str,
-                                      format_args args) {
-  memory_buffer buffer;
-  detail::vformat_to(buffer, format_str,
-                     basic_format_args<buffer_context<char>>(args));
-  fwrite_fully(buffer.data(), 1, buffer.size(), f);
+FMT_FUNC void vprint(std::FILE* f, string_view fmt, format_args args) {
+  if (!detail::file_ref(f).is_buffered() || !detail::has_flockfile<>())
+    return vprint_buffered(f, fmt, args);
+  auto&& buffer = detail::file_print_buffer<>(f);
+  return detail::vformat_to(buffer, fmt, args);
+}
+
+FMT_FUNC void vprintln(std::FILE* f, string_view fmt, format_args args) {
+  auto buffer = memory_buffer();
+  detail::vformat_to(buffer, fmt, args);
+  buffer.push_back('\n');
+  detail::print(f, {buffer.data(), buffer.size()});
 }
-#endif
 
-FMT_FUNC void vprint(string_view format_str, format_args args) {
-  vprint(stdout, format_str, args);
+FMT_FUNC void vprint(string_view fmt, format_args args) {
+  vprint(stdout, fmt, args);
 }
 
 namespace detail {
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/format.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/format.h
index 8dda88727d5c..67f0ab739b0d 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/format.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/format.h
@@ -33,31 +33,65 @@
 #ifndef FMT_FORMAT_H_
 #define FMT_FORMAT_H_
 
-#include <cmath>             // std::signbit
-#include <cstdint>           // uint32_t
-#include <cstring>           // std::memcpy
-#include <initializer_list>  // std::initializer_list
-#include <limits>            // std::numeric_limits
-#include <memory>            // std::uninitialized_copy
-#include <stdexcept>         // std::runtime_error
-#include <system_error>      // std::system_error
-
-#ifdef __cpp_lib_bit_cast
-#  include <bit>  // std::bitcast
+#ifndef _LIBCPP_REMOVE_TRANSITIVE_INCLUDES
+#  define _LIBCPP_REMOVE_TRANSITIVE_INCLUDES
+#  define FMT_REMOVE_TRANSITIVE_INCLUDES
 #endif
 
-#include "core.h"
+#include "base.h"
+
+#ifndef FMT_MODULE
+#  include <cmath>             // std::signbit
+#  include <cstdint>           // uint32_t
+#  include <cstring>           // std::memcpy
+#  include <initializer_list>  // std::initializer_list
+#  include <limits>            // std::numeric_limits
+#  if defined(__GLIBCXX__) && !defined(_GLIBCXX_USE_DUAL_ABI)
+// Workaround for pre gcc 5 libstdc++.
+#    include <memory>  // std::allocator_traits
+#  endif
+#  include <stdexcept>     // std::runtime_error
+#  include <string>        // std::string
+#  include <system_error>  // std::system_error
+
+// Checking FMT_CPLUSPLUS for warning suppression in MSVC.
+#  if FMT_HAS_INCLUDE(<bit>) && FMT_CPLUSPLUS > 201703L
+#    include <bit>  // std::bit_cast
+#  endif
+
+// libc++ supports string_view in pre-c++17.
+#  if FMT_HAS_INCLUDE(<string_view>) && \
+      (FMT_CPLUSPLUS >= 201703L || defined(_LIBCPP_VERSION))
+#    include <string_view>
+#    define FMT_USE_STRING_VIEW
+#  endif
+#endif  // FMT_MODULE
 
-#if FMT_GCC_VERSION
-#  define FMT_GCC_VISIBILITY_HIDDEN __attribute__((visibility("hidden")))
+#if defined __cpp_inline_variables && __cpp_inline_variables >= 201606L
+#  define FMT_INLINE_VARIABLE inline
 #else
-#  define FMT_GCC_VISIBILITY_HIDDEN
+#  define FMT_INLINE_VARIABLE
+#endif
+
+#ifndef FMT_NO_UNIQUE_ADDRESS
+#  if FMT_CPLUSPLUS >= 202002L
+#    if FMT_HAS_CPP_ATTRIBUTE(no_unique_address)
+#      define FMT_NO_UNIQUE_ADDRESS [[no_unique_address]]
+// VS2019 v16.10 and later except clang-cl (https://reviews.llvm.org/D110485).
+#    elif (FMT_MSC_VERSION >= 1929) && !FMT_CLANG_VERSION
+#      define FMT_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]]
+#    endif
+#  endif
+#endif
+#ifndef FMT_NO_UNIQUE_ADDRESS
+#  define FMT_NO_UNIQUE_ADDRESS
 #endif
 
-#ifdef __NVCC__
-#  define FMT_CUDA_VERSION (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__)
+// Visibility when compiled as a shared library/object.
+#if defined(FMT_LIB_EXPORT) || defined(FMT_SHARED)
+#  define FMT_SO_VISIBILITY(value) FMT_VISIBILITY(value)
 #else
-#  define FMT_CUDA_VERSION 0
+#  define FMT_SO_VISIBILITY(value)
 #endif
 
 #ifdef __has_builtin
@@ -72,11 +106,12 @@
 #  define FMT_NOINLINE
 #endif
 
-#if FMT_MSC_VERSION
-#  define FMT_MSC_DEFAULT = default
-#else
-#  define FMT_MSC_DEFAULT
-#endif
+namespace std {
+template <> struct iterator_traits<fmt::appender> {
+  using iterator_category = output_iterator_tag;
+  using value_type = char;
+};
+}  // namespace std
 
 #ifndef FMT_THROW
 #  if FMT_EXCEPTIONS
@@ -96,21 +131,11 @@ FMT_END_NAMESPACE
 #      define FMT_THROW(x) throw x
 #    endif
 #  else
-#    define FMT_THROW(x)               \
-      do {                             \
-        FMT_ASSERT(false, (x).what()); \
-      } while (false)
+#    define FMT_THROW(x) \
+      ::fmt::detail::assert_fail(__FILE__, __LINE__, (x).what())
 #  endif
 #endif
 
-#if FMT_EXCEPTIONS
-#  define FMT_TRY try
-#  define FMT_CATCH(x) catch (x)
-#else
-#  define FMT_TRY if (true)
-#  define FMT_CATCH(x) if (false)
-#endif
-
 #ifndef FMT_MAYBE_UNUSED
 #  if FMT_HAS_CPP17_ATTRIBUTE(maybe_unused)
 #    define FMT_MAYBE_UNUSED [[maybe_unused]]
@@ -121,7 +146,10 @@ FMT_END_NAMESPACE
 
 #ifndef FMT_USE_USER_DEFINED_LITERALS
 // EDG based compilers (Intel, NVIDIA, Elbrus, etc), GCC and MSVC support UDLs.
-#  if (FMT_HAS_FEATURE(cxx_user_literals) || FMT_GCC_VERSION >= 407 || \
+//
+// GCC before 4.9 requires a space in `operator"" _a` which is invalid in later
+// compiler versions.
+#  if (FMT_HAS_FEATURE(cxx_user_literals) || FMT_GCC_VERSION >= 409 || \
        FMT_MSC_VERSION >= 1900) &&                                     \
       (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= /* UDL feature */ 480)
 #    define FMT_USE_USER_DEFINED_LITERALS 1
@@ -201,7 +229,8 @@ inline auto clzll(uint64_t x) -> int {
   _BitScanReverse64(&r, x);
 #  else
   // Scan the high 32 bits.
-  if (_BitScanReverse(&r, static_cast<uint32_t>(x >> 32))) return 63 ^ (r + 32);
+  if (_BitScanReverse(&r, static_cast<uint32_t>(x >> 32)))
+    return 63 ^ static_cast<int>(r + 32);
   // Scan the low 32 bits.
   _BitScanReverse(&r, static_cast<uint32_t>(x));
 #  endif
@@ -241,6 +270,11 @@ FMT_END_NAMESPACE
 #endif
 
 FMT_BEGIN_NAMESPACE
+
+template <typename Char, typename Traits, typename Allocator>
+struct is_contiguous<std::basic_string<Char, Traits, Allocator>>
+    : std::true_type {};
+
 namespace detail {
 
 FMT_CONSTEXPR inline void abort_fuzzing_if(bool condition) {
@@ -250,49 +284,12 @@ FMT_CONSTEXPR inline void abort_fuzzing_if(bool condition) {
 #endif
 }
 
-template <typename CharT, CharT... C> struct string_literal {
-  static constexpr CharT value[sizeof...(C)] = {C...};
-  constexpr operator basic_string_view<CharT>() const {
-    return {value, sizeof...(C)};
-  }
-};
-
-#if FMT_CPLUSPLUS < 201703L
-template <typename CharT, CharT... C>
-constexpr CharT string_literal<CharT, C...>::value[sizeof...(C)];
+#if defined(FMT_USE_STRING_VIEW)
+template <typename Char> using std_string_view = std::basic_string_view<Char>;
+#else
+template <typename T> struct std_string_view {};
 #endif
 
-template <typename Streambuf> class formatbuf : public Streambuf {
- private:
-  using char_type = typename Streambuf::char_type;
-  using streamsize = decltype(std::declval<Streambuf>().sputn(nullptr, 0));
-  using int_type = typename Streambuf::int_type;
-  using traits_type = typename Streambuf::traits_type;
-
-  buffer<char_type>& buffer_;
-
- public:
-  explicit formatbuf(buffer<char_type>& buf) : buffer_(buf) {}
-
- protected:
-  // The put area is always empty. This makes the implementation simpler and has
-  // the advantage that the streambuf and the buffer are always in sync and
-  // sputc never writes into uninitialized memory. A disadvantage is that each
-  // call to sputc always results in a (virtual) call to overflow. There is no
-  // disadvantage here for sputn since this always results in a call to xsputn.
-
-  auto overflow(int_type ch) -> int_type override {
-    if (!traits_type::eq_int_type(ch, traits_type::eof()))
-      buffer_.push_back(static_cast<char_type>(ch));
-    return ch;
-  }
-
-  auto xsputn(const char_type* s, streamsize count) -> streamsize override {
-    buffer_.append(s, s + count);
-    return count;
-  }
-};
-
 // Implementation of std::bit_cast for pre-C++20.
 template <typename To, typename From, FMT_ENABLE_IF(sizeof(To) == sizeof(From))>
 FMT_CONSTEXPR20 auto bit_cast(const From& from) -> To {
@@ -324,14 +321,12 @@ class uint128_fallback {
  private:
   uint64_t lo_, hi_;
 
-  friend uint128_fallback umul128(uint64_t x, uint64_t y) noexcept;
-
  public:
   constexpr uint128_fallback(uint64_t hi, uint64_t lo) : lo_(lo), hi_(hi) {}
   constexpr uint128_fallback(uint64_t value = 0) : lo_(value), hi_(0) {}
 
-  constexpr uint64_t high() const noexcept { return hi_; }
-  constexpr uint64_t low() const noexcept { return lo_; }
+  constexpr auto high() const noexcept -> uint64_t { return hi_; }
+  constexpr auto low() const noexcept -> uint64_t { return lo_; }
 
   template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
   constexpr explicit operator T() const {
@@ -360,6 +355,10 @@ class uint128_fallback {
       -> uint128_fallback {
     return {lhs.hi_ & rhs.hi_, lhs.lo_ & rhs.lo_};
   }
+  friend constexpr auto operator~(const uint128_fallback& n)
+      -> uint128_fallback {
+    return {~n.hi_, ~n.lo_};  // Inverts both halves; order is {hi, lo}.
+  }
   friend auto operator+(const uint128_fallback& lhs,
                         const uint128_fallback& rhs) -> uint128_fallback {
     auto result = uint128_fallback(lhs);
@@ -398,8 +397,12 @@ class uint128_fallback {
     lo_ = new_lo;
     hi_ = new_hi;
   }
+  FMT_CONSTEXPR void operator&=(uint128_fallback n) {
+    lo_ &= n.lo_;  // ANDs each half in place; nothing is returned.
+    hi_ &= n.hi_;
+  }
 
-  FMT_CONSTEXPR20 uint128_fallback& operator+=(uint64_t n) noexcept {
+  FMT_CONSTEXPR20 auto operator+=(uint64_t n) noexcept -> uint128_fallback& {
     if (is_constant_evaluated()) {
       lo_ += n;
       hi_ += (lo_ < n ? 1 : 0);
@@ -443,7 +446,8 @@ template <typename T> constexpr auto num_bits() -> int {
 }
 // std::numeric_limits<T>::digits may return 0 for 128-bit ints.
 template <> constexpr auto num_bits<int128_opt>() -> int { return 128; }
-template <> constexpr auto num_bits<uint128_t>() -> int { return 128; }
+template <> constexpr auto num_bits<uint128_opt>() -> int { return 128; }
+template <> constexpr auto num_bits<uint128_fallback>() -> int { return 128; }
 
 // A heterogeneous bit_cast used for converting 96-bit long double to uint128_t
 // and 128-bit pointers to uint128_fallback.
@@ -464,10 +468,34 @@ inline auto bit_cast(const From& from) -> To {
   return result;
 }
 
+template <typename UInt>
+FMT_CONSTEXPR20 inline auto countl_zero_fallback(UInt n) -> int {
+  int lz = 0;  // Leading zero bits counted so far.
+  constexpr UInt msb_mask = static_cast<UInt>(1) << (num_bits<UInt>() - 1);
+  for (; (n & msb_mask) == 0; n <<= 1) lz++;  // Never ends for n == 0.
+  return lz;
+}
+
+FMT_CONSTEXPR20 inline auto countl_zero(uint32_t n) -> int {
+#ifdef FMT_BUILTIN_CLZ
+  if (!is_constant_evaluated()) return FMT_BUILTIN_CLZ(n);  // Run-time path.
+#endif
+  return countl_zero_fallback(n);  // Constant evaluation, or no builtin.
+}
+
+FMT_CONSTEXPR20 inline auto countl_zero(uint64_t n) -> int {
+#ifdef FMT_BUILTIN_CLZLL
+  if (!is_constant_evaluated()) return FMT_BUILTIN_CLZLL(n);  // Run-time path.
+#endif
+  return countl_zero_fallback(n);  // Constant evaluation, or no builtin.
+}
+
 FMT_INLINE void assume(bool condition) {
   (void)condition;
 #if FMT_HAS_BUILTIN(__builtin_assume) && !FMT_ICC_VERSION
   __builtin_assume(condition);
+#elif FMT_GCC_VERSION
+  if (!condition) __builtin_unreachable();
 #endif
 }
 
@@ -486,37 +514,24 @@ inline auto get_data(Container& c) -> typename Container::value_type* {
   return c.data();
 }
 
-#if defined(_SECURE_SCL) && _SECURE_SCL
-// Make a checked iterator to avoid MSVC warnings.
-template <typename T> using checked_ptr = stdext::checked_array_iterator<T*>;
-template <typename T>
-constexpr auto make_checked(T* p, size_t size) -> checked_ptr<T> {
-  return {p, size};
-}
-#else
-template <typename T> using checked_ptr = T*;
-template <typename T> constexpr auto make_checked(T* p, size_t) -> T* {
-  return p;
-}
-#endif
-
 // Attempts to reserve space for n extra characters in the output range.
 // Returns a pointer to the reserved range or a reference to it.
-template <typename Container, FMT_ENABLE_IF(is_contiguous<Container>::value)>
+template <typename OutputIt,
+          FMT_ENABLE_IF(is_back_insert_iterator<OutputIt>::value&&
+                            is_contiguous<typename OutputIt::container>::value)>
 #if FMT_CLANG_VERSION >= 307 && !FMT_ICC_VERSION
 __attribute__((no_sanitize("undefined")))
 #endif
 inline auto
-reserve(std::back_insert_iterator<Container> it, size_t n)
-    -> checked_ptr<typename Container::value_type> {
-  Container& c = get_container(it);
+reserve(OutputIt it, size_t n) -> typename OutputIt::value_type* {
+  auto& c = get_container(it);
   size_t size = c.size();
   c.resize(size + n);
-  return make_checked(get_data(c) + size, n);
+  return get_data(c) + size;
 }
 
 template <typename T>
-inline auto reserve(buffer_appender<T> it, size_t n) -> buffer_appender<T> {
+inline auto reserve(basic_appender<T> it, size_t n) -> basic_appender<T> {
   buffer<T>& buf = get_container(it);
   buf.try_reserve(buf.size() + n);
   return it;
@@ -535,18 +550,21 @@ template <typename T, typename OutputIt>
 constexpr auto to_pointer(OutputIt, size_t) -> T* {
   return nullptr;
 }
-template <typename T> auto to_pointer(buffer_appender<T> it, size_t n) -> T* {
+template <typename T> auto to_pointer(basic_appender<T> it, size_t n) -> T* {
   buffer<T>& buf = get_container(it);
   auto size = buf.size();
+  buf.try_reserve(size + n);
   if (buf.capacity() < size + n) return nullptr;
   buf.try_resize(size + n);
   return buf.data() + size;
 }
 
-template <typename Container, FMT_ENABLE_IF(is_contiguous<Container>::value)>
-inline auto base_iterator(std::back_insert_iterator<Container>& it,
-                          checked_ptr<typename Container::value_type>)
-    -> std::back_insert_iterator<Container> {
+template <typename OutputIt,
+          FMT_ENABLE_IF(is_back_insert_iterator<OutputIt>::value&&
+                            is_contiguous<typename OutputIt::container>::value)>
+inline auto base_iterator(OutputIt it,
+                          typename OutputIt::container_type::value_type*)
+    -> OutputIt {
   return it;
 }
 
@@ -572,16 +590,10 @@ FMT_CONSTEXPR20 auto fill_n(T* out, Size count, char value) -> T* {
   return out + count;
 }
 
-#ifdef __cpp_char8_t
-using char8_type = char8_t;
-#else
-enum char8_type : unsigned char {};
-#endif
-
 template <typename OutChar, typename InputIt, typename OutputIt>
-FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline(InputIt begin, InputIt end,
-                                                  OutputIt out) -> OutputIt {
-  return copy_str<OutChar>(begin, end, out);
+FMT_CONSTEXPR FMT_NOINLINE auto copy_noinline(InputIt begin, InputIt end,
+                                              OutputIt out) -> OutputIt {
+  return copy<OutChar>(begin, end, out);
 }
 
 // A public domain branchless UTF-8 decoder by Christopher Wellons:
@@ -608,7 +620,8 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
   constexpr const int shiftc[] = {0, 18, 12, 6, 0};
   constexpr const int shifte[] = {0, 6, 4, 2, 0};
 
-  int len = code_point_length_impl(*s);
+  int len = "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4"
+      [static_cast<unsigned char>(*s) >> 3];
   // Compute the pointer to the next character early so that the next
   // iteration can start working on the next character. Neither Clang
   // nor GCC figure out this reordering on their own.
@@ -637,7 +650,7 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
   return next;
 }
 
-constexpr uint32_t invalid_code_point = ~uint32_t();
+constexpr FMT_INLINE_VARIABLE uint32_t invalid_code_point = ~uint32_t();
 
 // Invokes f(cp, sv) for every code point cp in s with sv being the string view
 // corresponding to the code point. cp is invalid_code_point on error.
@@ -661,7 +674,7 @@ FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
   }
   if (auto num_chars_left = s.data() + s.size() - p) {
     char buf[2 * block_size - 1] = {};
-    copy_str<char>(p, p + num_chars_left, buf);
+    copy<char>(p, p + num_chars_left, buf);
     const char* buf_ptr = buf;
     do {
       auto end = decode(buf_ptr, p);
@@ -678,7 +691,7 @@ inline auto compute_width(basic_string_view<Char> s) -> size_t {
 }
 
 // Computes approximate display width of a UTF-8 string.
-FMT_CONSTEXPR inline size_t compute_width(string_view s) {
+FMT_CONSTEXPR inline auto compute_width(string_view s) -> size_t {
   size_t num_code_points = 0;
   // It is not a lambda for compatibility with C++14.
   struct count_code_points {
@@ -712,11 +725,6 @@ FMT_CONSTEXPR inline size_t compute_width(string_view s) {
   return num_code_points;
 }
 
-inline auto compute_width(basic_string_view<char8_type> s) -> size_t {
-  return compute_width(
-      string_view(reinterpret_cast<const char*>(s.data()), s.size()));
-}
-
 template <typename Char>
 inline auto code_point_index(basic_string_view<Char> s, size_t n) -> size_t {
   size_t size = s.size();
@@ -725,18 +733,17 @@ inline auto code_point_index(basic_string_view<Char> s, size_t n) -> size_t {
 
 // Calculates the index of the nth code point in a UTF-8 string.
 inline auto code_point_index(string_view s, size_t n) -> size_t {
-  const char* data = s.data();
-  size_t num_code_points = 0;
-  for (size_t i = 0, size = s.size(); i != size; ++i) {
-    if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) return i;
-  }
-  return s.size();
-}
-
-inline auto code_point_index(basic_string_view<char8_type> s, size_t n)
-    -> size_t {
-  return code_point_index(
-      string_view(reinterpret_cast<const char*>(s.data()), s.size()), n);
+  size_t result = s.size();
+  const char* begin = s.begin();
+  for_each_codepoint(s, [begin, &n, &result](uint32_t, string_view sv) {
+    if (n != 0) {
+      --n;
+      return true;
+    }
+    result = to_unsigned(sv.begin() - begin);
+    return false;
+  });
+  return result;
 }
 
 template <typename T> struct is_integral : std::is_integral<T> {};
@@ -754,18 +761,32 @@ using is_integer =
                   !std::is_same<T, char>::value &&
                   !std::is_same<T, wchar_t>::value>;
 
-#ifndef FMT_USE_FLOAT128
-#  ifdef __SIZEOF_FLOAT128__
-#    define FMT_USE_FLOAT128 1
-#  else
-#    define FMT_USE_FLOAT128 0
-#  endif
+#ifndef FMT_USE_FLOAT
+#  define FMT_USE_FLOAT 1
+#endif
+#ifndef FMT_USE_DOUBLE
+#  define FMT_USE_DOUBLE 1
+#endif
+#ifndef FMT_USE_LONG_DOUBLE
+#  define FMT_USE_LONG_DOUBLE 1
+#endif
+
+#if defined(FMT_USE_FLOAT128)
+// Use the provided definition.
+#elif FMT_CLANG_VERSION && FMT_HAS_INCLUDE(<quadmath.h>)
+#  define FMT_USE_FLOAT128 1
+#elif FMT_GCC_VERSION && defined(_GLIBCXX_USE_FLOAT128) && \
+    !defined(__STRICT_ANSI__)
+#  define FMT_USE_FLOAT128 1
+#else
+#  define FMT_USE_FLOAT128 0
 #endif
 #if FMT_USE_FLOAT128
 using float128 = __float128;
 #else
 using float128 = void;
 #endif
+
 template <typename T> using is_float128 = std::is_same<T, float128>;
 
 template <typename T>
@@ -784,61 +805,39 @@ using is_double_double = bool_constant<std::numeric_limits<T>::digits == 106>;
 #  define FMT_USE_FULL_CACHE_DRAGONBOX 0
 #endif
 
-template <typename T>
-template <typename U>
-void buffer<T>::append(const U* begin, const U* end) {
-  while (begin != end) {
-    auto count = to_unsigned(end - begin);
-    try_reserve(size_ + count);
-    auto free_cap = capacity_ - size_;
-    if (free_cap < count) count = free_cap;
-    std::uninitialized_copy_n(begin, count, make_checked(ptr_ + size_, count));
-    size_ += count;
-    begin += count;
-  }
-}
-
 template <typename T, typename Enable = void>
 struct is_locale : std::false_type {};
 template <typename T>
 struct is_locale<T, void_t<decltype(T::classic())>> : std::true_type {};
 }  // namespace detail
 
-FMT_MODULE_EXPORT_BEGIN
+FMT_BEGIN_EXPORT
 
 // The number of characters to store in the basic_memory_buffer object itself
 // to avoid dynamic memory allocation.
 enum { inline_buffer_size = 500 };
 
 /**
-  \rst
-  A dynamically growing memory buffer for trivially copyable/constructible types
-  with the first ``SIZE`` elements stored in the object itself.
-
-  You can use the ``memory_buffer`` type alias for ``char`` instead.
-
-  **Example**::
-
-     auto out = fmt::memory_buffer();
-     format_to(std::back_inserter(out), "The answer is {}.", 42);
-
-  This will append the following output to the ``out`` object:
-
-  .. code-block:: none
-
-     The answer is 42.
-
-  The output can be converted to an ``std::string`` with ``to_string(out)``.
-  \endrst
+ * A dynamically growing memory buffer for trivially copyable/constructible
+ * types with the first `SIZE` elements stored in the object itself. Most
+ * commonly used via the `memory_buffer` alias for `char`.
+ *
+ * **Example**:
+ *
+ *     auto out = fmt::memory_buffer();
+ *     fmt::format_to(std::back_inserter(out), "The answer is {}.", 42);
+ *
+ * This will append "The answer is 42." to `out`. The buffer content can be
+ * converted to `std::string` with `to_string(out)`.
  */
 template <typename T, size_t SIZE = inline_buffer_size,
           typename Allocator = std::allocator<T>>
-class basic_memory_buffer final : public detail::buffer<T> {
+class basic_memory_buffer : public detail::buffer<T> {
  private:
   T store_[SIZE];
 
-  // Don't inherit from Allocator avoid generating type_info for it.
-  Allocator alloc_;
+  // Don't inherit from Allocator to avoid generating type_info for it.
+  FMT_NO_UNIQUE_ADDRESS Allocator alloc_;
 
   // Deallocate memory allocated by the buffer.
   FMT_CONSTEXPR20 void deallocate() {
@@ -846,8 +845,29 @@ class basic_memory_buffer final : public detail::buffer<T> {
     if (data != store_) alloc_.deallocate(data, this->capacity());
   }
 
- protected:
-  FMT_CONSTEXPR20 void grow(size_t size) override;
+  static FMT_CONSTEXPR20 void grow(detail::buffer<T>& buf, size_t size) {
+    detail::abort_fuzzing_if(size > 5000);
+    auto& self = static_cast<basic_memory_buffer&>(buf);
+    const size_t max_size =
+        std::allocator_traits<Allocator>::max_size(self.alloc_);
+    size_t old_capacity = buf.capacity();
+    size_t new_capacity = old_capacity + old_capacity / 2;  // Grow by 1.5x.
+    if (size > new_capacity)
+      new_capacity = size;  // 1.5x growth is too small for the request.
+    else if (new_capacity > max_size)
+      new_capacity = size > max_size ? size : max_size;  // max(size, max_size)
+    T* old_data = buf.data();
+    T* new_data = self.alloc_.allocate(new_capacity);
+    // Suppress a bogus -Wstringop-overflow in gcc 13.1 (#3481).
+    detail::assume(buf.size() <= new_capacity);
+    // The following code doesn't throw, so the raw pointer above doesn't leak.
+    memcpy(new_data, old_data, buf.size() * sizeof(T));
+    self.set(new_data, new_capacity);  // Switch to the new storage.
+    // deallocate must not throw according to the standard, but even if it does,
+    // the buffer already uses the new storage and will deallocate it in
+    // destructor.
+    if (old_data != self.store_) self.alloc_.deallocate(old_data, old_capacity);
+  }
 
  public:
   using value_type = T;
@@ -855,7 +875,7 @@ class basic_memory_buffer final : public detail::buffer<T> {
 
   FMT_CONSTEXPR20 explicit basic_memory_buffer(
       const Allocator& alloc = Allocator())
-      : alloc_(alloc) {
+      : detail::buffer<T>(grow), alloc_(alloc) {
     this->set(store_, SIZE);
     if (detail::is_constant_evaluated()) detail::fill_n(store_, SIZE, T());
   }
@@ -869,8 +889,7 @@ class basic_memory_buffer final : public detail::buffer<T> {
     size_t size = other.size(), capacity = other.capacity();
     if (data == other.store_) {
       this->set(store_, capacity);
-      detail::copy_str<T>(other.store_, other.store_ + size,
-                          detail::make_checked(store_, capacity));
+      detail::copy<T>(other.store_, other.store_ + size, store_);
     } else {
       this->set(data, capacity);
       // Set pointer to the inline array so that delete is not called
@@ -882,21 +901,14 @@ class basic_memory_buffer final : public detail::buffer<T> {
   }
 
  public:
-  /**
-    \rst
-    Constructs a :class:`fmt::basic_memory_buffer` object moving the content
-    of the other object to it.
-    \endrst
-   */
-  FMT_CONSTEXPR20 basic_memory_buffer(basic_memory_buffer&& other) noexcept {
+  /// Constructs a `basic_memory_buffer` object moving the content of the other
+  /// object to it.
+  FMT_CONSTEXPR20 basic_memory_buffer(basic_memory_buffer&& other) noexcept
+      : detail::buffer<T>(grow) {
     move(other);
   }
 
-  /**
-    \rst
-    Moves the content of the other ``basic_memory_buffer`` object to this one.
-    \endrst
-   */
+  /// Moves the content of the other `basic_memory_buffer` object to this one.
   auto operator=(basic_memory_buffer&& other) noexcept -> basic_memory_buffer& {
     FMT_ASSERT(this != &other, "");
     deallocate();
@@ -907,16 +919,13 @@ class basic_memory_buffer final : public detail::buffer<T> {
   // Returns a copy of the allocator associated with this buffer.
   auto get_allocator() const -> Allocator { return alloc_; }
 
-  /**
-    Resizes the buffer to contain *count* elements. If T is a POD type new
-    elements may not be initialized.
-   */
+  /// Resizes the buffer to contain `count` elements. If T is a POD type new
+  /// elements may not be initialized.
   FMT_CONSTEXPR20 void resize(size_t count) { this->try_resize(count); }
 
-  /** Increases the buffer capacity to *new_capacity*. */
+  /// Increases the buffer capacity to `new_capacity`.
   void reserve(size_t new_capacity) { this->try_reserve(new_capacity); }
 
-  // Directly append data into the buffer
   using detail::buffer<T>::append;
   template <typename ContiguousRange>
   void append(const ContiguousRange& range) {
@@ -924,61 +933,37 @@ class basic_memory_buffer final : public detail::buffer<T> {
   }
 };
 
-template <typename T, size_t SIZE, typename Allocator>
-FMT_CONSTEXPR20 void basic_memory_buffer<T, SIZE, Allocator>::grow(
-    size_t size) {
-  detail::abort_fuzzing_if(size > 5000);
-  const size_t max_size = std::allocator_traits<Allocator>::max_size(alloc_);
-  size_t old_capacity = this->capacity();
-  size_t new_capacity = old_capacity + old_capacity / 2;
-  if (size > new_capacity)
-    new_capacity = size;
-  else if (new_capacity > max_size)
-    new_capacity = size > max_size ? size : max_size;
-  T* old_data = this->data();
-  T* new_data =
-      std::allocator_traits<Allocator>::allocate(alloc_, new_capacity);
-  // The following code doesn't throw, so the raw pointer above doesn't leak.
-  std::uninitialized_copy(old_data, old_data + this->size(),
-                          detail::make_checked(new_data, new_capacity));
-  this->set(new_data, new_capacity);
-  // deallocate must not throw according to the standard, but even if it does,
-  // the buffer already uses the new storage and will deallocate it in
-  // destructor.
-  if (old_data != store_) alloc_.deallocate(old_data, old_capacity);
-}
-
 using memory_buffer = basic_memory_buffer<char>;
 
 template <typename T, size_t SIZE, typename Allocator>
 struct is_contiguous<basic_memory_buffer<T, SIZE, Allocator>> : std::true_type {
 };
 
+FMT_END_EXPORT
 namespace detail {
-#ifdef _WIN32
-FMT_API bool write_console(std::FILE* f, string_view text);
-#endif
+FMT_API auto write_console(int fd, string_view text) -> bool;
 FMT_API void print(std::FILE*, string_view);
 }  // namespace detail
 
-/** An error reported from a formatting function. */
-FMT_CLASS_API
-class FMT_API format_error : public std::runtime_error {
+FMT_BEGIN_EXPORT
+
+// Suppress a misleading warning in older versions of clang.
+#if FMT_CLANG_VERSION
+#  pragma clang diagnostic ignored "-Wweak-vtables"
+#endif
+
+/// An error reported from a formatting function.
+class FMT_SO_VISIBILITY("default") format_error : public std::runtime_error {
  public:
   using std::runtime_error::runtime_error;
-  format_error(const format_error&) = default;
-  format_error& operator=(const format_error&) = default;
-  format_error(format_error&&) = default;
-  format_error& operator=(format_error&&) = default;
-  ~format_error() noexcept override FMT_MSC_DEFAULT;
 };
 
 namespace detail_exported {
 #if FMT_USE_NONTYPE_TEMPLATE_ARGS
 template <typename Char, size_t N> struct fixed_string {
   constexpr fixed_string(const Char (&str)[N]) {
-    detail::copy_str<Char, const Char*, Char*>(static_cast<const Char*>(str),
-                                               str + N, data);
+    detail::copy<Char, const Char*, Char*>(static_cast<const Char*>(str),
+                                           str + N, data);
   }
   Char data[N] = {};
 };
@@ -993,12 +978,57 @@ constexpr auto compile_string_to_view(const Char (&s)[N])
   return {s, N - (std::char_traits<Char>::to_int_type(s[N - 1]) == 0 ? 1 : 0)};
 }
 template <typename Char>
-constexpr auto compile_string_to_view(detail::std_string_view<Char> s)
+constexpr auto compile_string_to_view(basic_string_view<Char> s)
     -> basic_string_view<Char> {
-  return {s.data(), s.size()};
+  return s;
 }
 }  // namespace detail_exported
 
+// A generic formatting context that bundles an output iterator, the format
+// arguments and a locale reference. Char is the format string code unit type,
+// which can be different from OutputIt::value_type.
+template <typename OutputIt, typename Char> class generic_context {
+ private:
+  OutputIt out_;  // Current output position, see out() and advance_to().
+  basic_format_args<generic_context> args_;  // Queried by arg() and arg_id().
+  detail::locale_ref loc_;  // Returned by locale().
+
+ public:
+  using char_type = Char;
+  using iterator = OutputIt;
+  using parse_context_type = basic_format_parse_context<Char>;
+  template <typename T> using formatter_type = formatter<T, Char>;
+
+  constexpr generic_context(OutputIt out,
+                            basic_format_args<generic_context> ctx_args,
+                            detail::locale_ref loc = {})
+      : out_(out), args_(ctx_args), loc_(loc) {}
+  generic_context(generic_context&&) = default;
+  generic_context(const generic_context&) = delete;  // Movable, not copyable.
+  void operator=(const generic_context&) = delete;
+
+  constexpr auto arg(int id) const -> basic_format_arg<generic_context> {
+    return args_.get(id);  // Lookup by position.
+  }
+  auto arg(basic_string_view<Char> name) -> basic_format_arg<generic_context> {
+    return args_.get(name);  // Lookup by name.
+  }
+  FMT_CONSTEXPR auto arg_id(basic_string_view<Char> name) -> int {
+    return args_.get_id(name);
+  }
+  auto args() const -> const basic_format_args<generic_context>& {
+    return args_;
+  }
+
+  FMT_CONSTEXPR auto out() -> iterator { return out_; }
+
+  void advance_to(iterator it) {  // Ignored for back-insert iterators.
+    if (!detail::is_back_insert_iterator<iterator>()) out_ = it;
+  }
+
+  FMT_CONSTEXPR auto locale() -> detail::locale_ref { return loc_; }
+};
+
 class loc_value {
  private:
   basic_format_arg<format_context> value_;
@@ -1011,7 +1041,7 @@ class loc_value {
   loc_value(T) {}
 
   template <typename Visitor> auto visit(Visitor&& vis) -> decltype(vis(0)) {
-    return visit_format_arg(vis, value_);
+    return value_.visit(vis);
   }
 };
 
@@ -1044,7 +1074,9 @@ template <typename Locale> class format_facet : public Locale::facet {
   }
 };
 
-FMT_BEGIN_DETAIL_NAMESPACE
+FMT_END_EXPORT
+
+namespace detail {
 
 // Returns true if value is negative, false otherwise.
 // Same as `value < 0` but doesn't produce warnings if T is an unsigned type.
@@ -1075,13 +1107,13 @@ using uint32_or_64_or_128_t =
 template <typename T>
 using uint64_or_128_t = conditional_t<num_bits<T>() <= 64, uint64_t, uint128_t>;
 
-#define FMT_POWERS_OF_10(factor)                                             \
-  factor * 10, (factor)*100, (factor)*1000, (factor)*10000, (factor)*100000, \
-      (factor)*1000000, (factor)*10000000, (factor)*100000000,               \
-      (factor)*1000000000
+#define FMT_POWERS_OF_10(factor)                                  \
+  factor * 10, (factor) * 100, (factor) * 1000, (factor) * 10000, \
+      (factor) * 100000, (factor) * 1000000, (factor) * 10000000, \
+      (factor) * 100000000, (factor) * 1000000000
 
 // Converts value in the range [0, 100) to a string.
-constexpr const char* digits2(size_t value) {
+constexpr auto digits2(size_t value) -> const char* {
   // GCC generates slightly better code when value is pointer-size.
   return &"0001020304050607080910111213141516171819"
          "2021222324252627282930313233343536373839"
@@ -1091,11 +1123,11 @@ constexpr const char* digits2(size_t value) {
 }
 
 // Sign is a template parameter to workaround a bug in gcc 4.8.
-template <typename Char, typename Sign> constexpr Char sign(Sign s) {
+template <typename Char, typename Sign> constexpr auto sign(Sign s) -> Char {
 #if !FMT_GCC_VERSION || FMT_GCC_VERSION >= 604
   static_assert(std::is_same<Sign, sign_t>::value, "");
 #endif
-  return static_cast<Char>("\0-+ "[s]);
+  return static_cast<char>(((' ' << 24) | ('+' << 16) | ('-' << 8)) >> (s * 8));
 }
 
 template <typename T> FMT_CONSTEXPR auto count_digits_fallback(T n) -> int {
@@ -1143,9 +1175,7 @@ inline auto do_count_digits(uint64_t n) -> int {
 // except for n == 0 in which case count_digits returns 1.
 FMT_CONSTEXPR20 inline auto count_digits(uint64_t n) -> int {
 #ifdef FMT_BUILTIN_CLZLL
-  if (!is_constant_evaluated()) {
-    return do_count_digits(n);
-  }
+  if (!is_constant_evaluated()) return do_count_digits(n);
 #endif
   return count_digits_fallback(n);
 }
@@ -1173,7 +1203,7 @@ FMT_CONSTEXPR auto count_digits(UInt n) -> int {
 FMT_INLINE auto do_count_digits(uint32_t n) -> int {
 // An optimization by Kendall Willets from https://bit.ly/3uOIQrB.
 // This increments the upper 32 bits (log10(T) - 1) when >= T is added.
-#  define FMT_INC(T) (((sizeof(#  T) - 1ull) << 32) - T)
+#  define FMT_INC(T) (((sizeof(#T) - 1ull) << 32) - T)
   static constexpr uint64_t table[] = {
       FMT_INC(0),          FMT_INC(0),          FMT_INC(0),           // 8
       FMT_INC(10),         FMT_INC(10),         FMT_INC(10),          // 64
@@ -1291,7 +1321,7 @@ FMT_CONSTEXPR inline auto format_decimal(Iterator out, UInt value, int size)
   // Buffer is large enough to hold all digits (digits10 + 1).
   Char buffer[digits10<UInt>() + 1] = {};
   auto end = format_decimal(buffer, value, size).end;
-  return {out, detail::copy_str_noinline<Char>(buffer, end, out)};
+  return {out, detail::copy_noinline<Char>(buffer, end, out)};
 }
 
 template <unsigned BASE_BITS, typename Char, typename UInt>
@@ -1309,16 +1339,16 @@ FMT_CONSTEXPR auto format_uint(Char* buffer, UInt value, int num_digits,
 }
 
 template <unsigned BASE_BITS, typename Char, typename It, typename UInt>
-inline auto format_uint(It out, UInt value, int num_digits, bool upper = false)
-    -> It {
+FMT_CONSTEXPR inline auto format_uint(It out, UInt value, int num_digits,
+                                      bool upper = false) -> It {
   if (auto ptr = to_pointer<Char>(out, to_unsigned(num_digits))) {
     format_uint<BASE_BITS>(ptr, value, num_digits, upper);
     return out;
   }
   // Buffer should be large enough to hold all digits (digits / BASE_BITS + 1).
-  char buffer[num_bits<UInt>() / BASE_BITS + 1];
+  char buffer[num_bits<UInt>() / BASE_BITS + 1] = {};
   format_uint<BASE_BITS>(buffer, value, num_digits, upper);
-  return detail::copy_str_noinline<Char>(buffer, buffer + num_digits, out);
+  return detail::copy_noinline<Char>(buffer, buffer + num_digits, out);
 }
 
 // A converter from UTF-8 to UTF-16.
@@ -1334,7 +1364,140 @@ class utf8_to_utf16 {
   auto str() const -> std::wstring { return {&buffer_[0], size()}; }
 };
 
+enum class to_utf8_error_policy { abort, replace };
+
+// Converts UTF-16 (2-byte WChar) or UTF-32 (4-byte), host endian, to UTF-8.
+template <typename WChar, typename Buffer = memory_buffer> class to_utf8 {
+ private:
+  Buffer buffer_;
+
+ public:
+  to_utf8() {}
+  explicit to_utf8(basic_string_view<WChar> s,
+                   to_utf8_error_policy policy = to_utf8_error_policy::abort) {
+    static_assert(sizeof(WChar) == 2 || sizeof(WChar) == 4,
+                  "Expect utf16 or utf32");
+    if (!convert(s, policy))
+      FMT_THROW(std::runtime_error(sizeof(WChar) == 2 ? "invalid utf16"
+                                                      : "invalid utf32"));
+  }
+  operator string_view() const { return string_view(&buffer_[0], size()); }
+  auto size() const -> size_t { return buffer_.size() - 1; }
+  auto c_str() const -> const char* { return &buffer_[0]; }
+  auto str() const -> std::string { return std::string(&buffer_[0], size()); }
+
+  // Converts s and appends a terminating 0 (not counted by size()). Returns
+  // false on invalid input instead of throwing; may throw on allocation error.
+  auto convert(basic_string_view<WChar> s,
+               to_utf8_error_policy policy = to_utf8_error_policy::abort)
+      -> bool {
+    if (!convert(buffer_, s, policy)) return false;
+    buffer_.push_back(0);
+    return true;
+  }
+  static auto convert(Buffer& buf, basic_string_view<WChar> s,
+                      to_utf8_error_policy policy = to_utf8_error_policy::abort)
+      -> bool {  // Appends the UTF-8 bytes to buf without a terminator.
+    for (auto p = s.begin(); p != s.end(); ++p) {
+      uint32_t c = static_cast<uint32_t>(*p);
+      if (sizeof(WChar) == 2 && c >= 0xd800 && c <= 0xdfff) {
+        // Handle a surrogate pair (high surrogate followed by a low one).
+        ++p;
+        if (p == s.end() || (c & 0xfc00) != 0xd800 || (*p & 0xfc00) != 0xdc00) {
+          if (policy == to_utf8_error_policy::abort) return false;
+          buf.append(string_view("\xEF\xBF\xBD"));  // U+FFFD
+          --p;  // Step back so the loop's ++p moves to the next unit.
+          continue;
+        }
+        c = (c << 10) + static_cast<uint32_t>(*p) - 0x35fdc00;
+      }
+      if (c < 0x80) {  // Encode the code point as 1 to 4 UTF-8 bytes.
+        buf.push_back(static_cast<char>(c));
+      } else if (c < 0x800) {
+        buf.push_back(static_cast<char>(0xc0 | (c >> 6)));
+        buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
+      } else if ((c >= 0x800 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xffff)) {
+        buf.push_back(static_cast<char>(0xe0 | (c >> 12)));
+        buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
+        buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
+      } else if (c >= 0x10000 && c <= 0x10ffff) {
+        buf.push_back(static_cast<char>(0xf0 | (c >> 18)));
+        buf.push_back(static_cast<char>(0x80 | ((c & 0x3ffff) >> 12)));
+        buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
+        buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
+      } else {
+        return false;
+      }
+    }
+    return true;
+  }
+};
+
+// Computes the full 128-bit product of two 64-bit unsigned integers.
+inline auto umul128(uint64_t x, uint64_t y) noexcept -> uint128_fallback {
+#if FMT_USE_INT128
+  auto p = static_cast<uint128_opt>(x) * static_cast<uint128_opt>(y);
+  return {static_cast<uint64_t>(p >> 64), static_cast<uint64_t>(p)};
+#elif defined(_MSC_VER) && defined(_M_X64)
+  auto hi = uint64_t();
+  auto lo = _umul128(x, y, &hi);
+  return {hi, lo};  // uint128_fallback is constructed as {hi, lo}.
+#else  // Portable fallback: combine four 32x32-bit partial products.
+  const uint64_t mask = static_cast<uint64_t>(max_value<uint32_t>());
+
+  uint64_t a = x >> 32;  // High 32 bits of x.
+  uint64_t b = x & mask;  // Low half of x (mask is max_value<uint32_t>()).
+  uint64_t c = y >> 32;  // High 32 bits of y.
+  uint64_t d = y & mask;  // Low half of y.
+
+  uint64_t ac = a * c;
+  uint64_t bc = b * c;
+  uint64_t ad = a * d;
+  uint64_t bd = b * d;
+
+  uint64_t intermediate = (bd >> 32) + (ad & mask) + (bc & mask);
+
+  return {ac + (intermediate >> 32) + (ad >> 32) + (bc >> 32),
+          (intermediate << 32) + (bd & mask)};
+#endif
+}
+
 namespace dragonbox {
+// Computes floor(log10(pow(2, e))) for e in [-2620, 2620] using the method from
+// https://fmt.dev/papers/Dragonbox.pdf#page=28, section 6.1.
+inline auto floor_log10_pow2(int e) noexcept -> int {
+  FMT_ASSERT(e <= 2620 && e >= -2620, "too large exponent");
+  static_assert((-1 >> 1) == -1, "right shift is not arithmetic");
+  return (e * 315653) >> 20;  // 315653 / 2^20 approximates log10(2).
+}
+
+inline auto floor_log2_pow10(int e) noexcept -> int {  // floor(log2(10^e))
+  FMT_ASSERT(e <= 1233 && e >= -1233, "too large exponent");
+  return (e * 1741647) >> 19;  // 1741647 / 2^19 approximates log2(10).
+}
+
+// Computes the upper 64 bits of the product of two 64-bit unsigned integers.
+inline auto umul128_upper64(uint64_t x, uint64_t y) noexcept -> uint64_t {
+#if FMT_USE_INT128
+  auto p = static_cast<uint128_opt>(x) * static_cast<uint128_opt>(y);
+  return static_cast<uint64_t>(p >> 64);
+#elif defined(_MSC_VER) && defined(_M_X64)
+  return __umulh(x, y);
+#else  // Portable path: compute the full product and keep its high half.
+  return umul128(x, y).high();
+#endif
+}
+
+// Computes the upper 128 bits of the 192-bit product of a 64-bit unsigned
+// integer and a 128-bit unsigned integer.
+inline auto umul192_upper128(uint64_t x, uint128_fallback y) noexcept
+    -> uint128_fallback {
+  uint128_fallback r = umul128(x, y.high());  // x * high(y), all 128 bits.
+  r += umul128_upper64(x, y.low());  // Add the upper half of x * low(y).
+  return r;
+}
+
+FMT_API auto get_cached_power(int k) noexcept -> uint128_fallback;
 
 // Type-specific information that Dragonbox uses.
 template <typename T, typename Enable = void> struct float_info;
@@ -1358,7 +1521,7 @@ template <> struct float_info<double> {
   static const int big_divisor = 1000;
   static const int small_divisor = 100;
   static const int min_k = -292;
-  static const int max_k = 326;
+  static const int max_k = 341;
   static const int shorter_interval_tie_lower_threshold = -77;
   static const int shorter_interval_tie_upper_threshold = -77;
 };
@@ -1388,14 +1551,14 @@ template <typename T> FMT_API auto to_decimal(T x) noexcept -> decimal_fp<T>;
 }  // namespace dragonbox
 
 // Returns true iff Float has the implicit bit which is not stored.
-template <typename Float> constexpr bool has_implicit_bit() {
+template <typename Float> constexpr auto has_implicit_bit() -> bool {
   // An 80-bit FP number has a 64-bit significand an no implicit bit.
   return std::numeric_limits<Float>::digits != 64;
 }
 
 // Returns the number of significand bits stored in Float. The implicit bit is
 // not counted since it is not stored.
-template <typename Float> constexpr int num_significand_bits() {
+template <typename Float> constexpr auto num_significand_bits() -> int {
   // std::numeric_limits may not support __float128.
   return is_float128<Float>() ? 112
                               : (std::numeric_limits<Float>::digits -
@@ -1405,8 +1568,8 @@ template <typename Float> constexpr int num_significand_bits() {
 template <typename Float>
 constexpr auto exponent_mask() ->
     typename dragonbox::float_info<Float>::carrier_uint {
-  using uint = typename dragonbox::float_info<Float>::carrier_uint;
-  return ((uint(1) << dragonbox::float_info<Float>::exponent_bits) - 1)
+  using float_uint = typename dragonbox::float_info<Float>::carrier_uint;
+  return ((float_uint(1) << dragonbox::float_info<Float>::exponent_bits) - 1)
          << num_significand_bits<Float>();
 }
 template <typename Float> constexpr auto exponent_bias() -> int {
@@ -1488,7 +1651,7 @@ using fp = basic_fp<unsigned long long>;
 
 // Normalizes the value converted from double and multiplied by (1 << SHIFT).
 template <int SHIFT = 0, typename F>
-FMT_CONSTEXPR basic_fp<F> normalize(basic_fp<F> value) {
+FMT_CONSTEXPR auto normalize(basic_fp<F> value) -> basic_fp<F> {
   // Handle subnormals.
   const auto implicit_bit = F(1) << num_significand_bits<double>();
   const auto shifted_implicit_bit = implicit_bit << SHIFT;
@@ -1505,7 +1668,7 @@ FMT_CONSTEXPR basic_fp<F> normalize(basic_fp<F> value) {
 }
 
 // Computes lhs * rhs / pow(2, 64) rounded to nearest with half-up tie breaking.
-FMT_CONSTEXPR inline uint64_t multiply(uint64_t lhs, uint64_t rhs) {
+FMT_CONSTEXPR inline auto multiply(uint64_t lhs, uint64_t rhs) -> uint64_t {
 #if FMT_USE_INT128
   auto product = static_cast<__uint128_t>(lhs) * rhs;
   auto f = static_cast<uint64_t>(product >> 64);
@@ -1522,188 +1685,36 @@ FMT_CONSTEXPR inline uint64_t multiply(uint64_t lhs, uint64_t rhs) {
 #endif
 }
 
-FMT_CONSTEXPR inline fp operator*(fp x, fp y) {
+FMT_CONSTEXPR inline auto operator*(fp x, fp y) -> fp {
   return {multiply(x.f, y.f), x.e + y.e + 64};
 }
 
-template <typename T = void> struct basic_data {
-  // Normalized 64-bit significands of pow(10, k), for k = -348, -340, ..., 340.
-  // These are generated by support/compute-powers.py.
-  static constexpr uint64_t pow10_significands[87] = {
-      0xfa8fd5a0081c0288, 0xbaaee17fa23ebf76, 0x8b16fb203055ac76,
-      0xcf42894a5dce35ea, 0x9a6bb0aa55653b2d, 0xe61acf033d1a45df,
-      0xab70fe17c79ac6ca, 0xff77b1fcbebcdc4f, 0xbe5691ef416bd60c,
-      0x8dd01fad907ffc3c, 0xd3515c2831559a83, 0x9d71ac8fada6c9b5,
-      0xea9c227723ee8bcb, 0xaecc49914078536d, 0x823c12795db6ce57,
-      0xc21094364dfb5637, 0x9096ea6f3848984f, 0xd77485cb25823ac7,
-      0xa086cfcd97bf97f4, 0xef340a98172aace5, 0xb23867fb2a35b28e,
-      0x84c8d4dfd2c63f3b, 0xc5dd44271ad3cdba, 0x936b9fcebb25c996,
-      0xdbac6c247d62a584, 0xa3ab66580d5fdaf6, 0xf3e2f893dec3f126,
-      0xb5b5ada8aaff80b8, 0x87625f056c7c4a8b, 0xc9bcff6034c13053,
-      0x964e858c91ba2655, 0xdff9772470297ebd, 0xa6dfbd9fb8e5b88f,
-      0xf8a95fcf88747d94, 0xb94470938fa89bcf, 0x8a08f0f8bf0f156b,
-      0xcdb02555653131b6, 0x993fe2c6d07b7fac, 0xe45c10c42a2b3b06,
-      0xaa242499697392d3, 0xfd87b5f28300ca0e, 0xbce5086492111aeb,
-      0x8cbccc096f5088cc, 0xd1b71758e219652c, 0x9c40000000000000,
-      0xe8d4a51000000000, 0xad78ebc5ac620000, 0x813f3978f8940984,
-      0xc097ce7bc90715b3, 0x8f7e32ce7bea5c70, 0xd5d238a4abe98068,
-      0x9f4f2726179a2245, 0xed63a231d4c4fb27, 0xb0de65388cc8ada8,
-      0x83c7088e1aab65db, 0xc45d1df942711d9a, 0x924d692ca61be758,
-      0xda01ee641a708dea, 0xa26da3999aef774a, 0xf209787bb47d6b85,
-      0xb454e4a179dd1877, 0x865b86925b9bc5c2, 0xc83553c5c8965d3d,
-      0x952ab45cfa97a0b3, 0xde469fbd99a05fe3, 0xa59bc234db398c25,
-      0xf6c69a72a3989f5c, 0xb7dcbf5354e9bece, 0x88fcf317f22241e2,
-      0xcc20ce9bd35c78a5, 0x98165af37b2153df, 0xe2a0b5dc971f303a,
-      0xa8d9d1535ce3b396, 0xfb9b7cd9a4a7443c, 0xbb764c4ca7a44410,
-      0x8bab8eefb6409c1a, 0xd01fef10a657842c, 0x9b10a4e5e9913129,
-      0xe7109bfba19c0c9d, 0xac2820d9623bf429, 0x80444b5e7aa7cf85,
-      0xbf21e44003acdd2d, 0x8e679c2f5e44ff8f, 0xd433179d9c8cb841,
-      0x9e19db92b4e31ba9, 0xeb96bf6ebadf77d9, 0xaf87023b9bf0ee6b,
-  };
-
-#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
-#  pragma GCC diagnostic push
-#  pragma GCC diagnostic ignored "-Wnarrowing"
-#endif
-  // Binary exponents of pow(10, k), for k = -348, -340, ..., 340, corresponding
-  // to significands above.
-  static constexpr int16_t pow10_exponents[87] = {
-      -1220, -1193, -1166, -1140, -1113, -1087, -1060, -1034, -1007, -980, -954,
-      -927,  -901,  -874,  -847,  -821,  -794,  -768,  -741,  -715,  -688, -661,
-      -635,  -608,  -582,  -555,  -529,  -502,  -475,  -449,  -422,  -396, -369,
-      -343,  -316,  -289,  -263,  -236,  -210,  -183,  -157,  -130,  -103, -77,
-      -50,   -24,   3,     30,    56,    83,    109,   136,   162,   189,  216,
-      242,   269,   295,   322,   348,   375,   402,   428,   455,   481,  508,
-      534,   561,   588,   614,   641,   667,   694,   720,   747,   774,  800,
-      827,   853,   880,   907,   933,   960,   986,   1013,  1039,  1066};
-#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409
-#  pragma GCC diagnostic pop
-#endif
-
-  static constexpr uint64_t power_of_10_64[20] = {
-      1, FMT_POWERS_OF_10(1ULL), FMT_POWERS_OF_10(1000000000ULL),
-      10000000000000000000ULL};
-};
-
-#if FMT_CPLUSPLUS < 201703L
-template <typename T> constexpr uint64_t basic_data<T>::pow10_significands[];
-template <typename T> constexpr int16_t basic_data<T>::pow10_exponents[];
-template <typename T> constexpr uint64_t basic_data<T>::power_of_10_64[];
-#endif
-
-// This is a struct rather than an alias to avoid shadowing warnings in gcc.
-struct data : basic_data<> {};
-
-// Returns a cached power of 10 `c_k = c_k.f * pow(2, c_k.e)` such that its
-// (binary) exponent satisfies `min_exponent <= c_k.e <= min_exponent + 28`.
-FMT_CONSTEXPR inline fp get_cached_power(int min_exponent,
-                                         int& pow10_exponent) {
-  const int shift = 32;
-  // log10(2) = 0x0.4d104d427de7fbcc...
-  const int64_t significand = 0x4d104d427de7fbcc;
-  int index = static_cast<int>(
-      ((min_exponent + fp::num_significand_bits - 1) * (significand >> shift) +
-       ((int64_t(1) << shift) - 1))  // ceil
-      >> 32                          // arithmetic shift
-  );
-  // Decimal exponent of the first (smallest) cached power of 10.
-  const int first_dec_exp = -348;
-  // Difference between 2 consecutive decimal exponents in cached powers of 10.
-  const int dec_exp_step = 8;
-  index = (index - first_dec_exp - 1) / dec_exp_step + 1;
-  pow10_exponent = first_dec_exp + index * dec_exp_step;
-  // Using *(x + index) instead of x[index] avoids an issue with some compilers
-  // using the EDG frontend (e.g. nvhpc/22.3 in C++17 mode).
-  return {*(data::pow10_significands + index),
-          *(data::pow10_exponents + index)};
-}
-
-#ifndef _MSC_VER
-#  define FMT_SNPRINTF snprintf
-#else
-FMT_API auto fmt_snprintf(char* buf, size_t size, const char* fmt, ...) -> int;
-#  define FMT_SNPRINTF fmt_snprintf
-#endif  // _MSC_VER
-
-// Formats a floating-point number with snprintf using the hexfloat format.
-template <typename T>
-auto snprintf_float(T value, int precision, float_specs specs,
-                    buffer<char>& buf) -> int {
-  // Buffer capacity must be non-zero, otherwise MSVC's vsnprintf_s will fail.
-  FMT_ASSERT(buf.capacity() > buf.size(), "empty buffer");
-  FMT_ASSERT(specs.format == float_format::hex, "");
-  static_assert(!std::is_same<T, float>::value, "");
-
-  // Build the format string.
-  char format[7];  // The longest format is "%#.*Le".
-  char* format_ptr = format;
-  *format_ptr++ = '%';
-  if (specs.showpoint) *format_ptr++ = '#';
-  if (precision >= 0) {
-    *format_ptr++ = '.';
-    *format_ptr++ = '*';
-  }
-  if (std::is_same<T, long double>()) *format_ptr++ = 'L';
-  *format_ptr++ = specs.upper ? 'A' : 'a';
-  *format_ptr = '\0';
-
-  // Format using snprintf.
-  auto offset = buf.size();
-  for (;;) {
-    auto begin = buf.data() + offset;
-    auto capacity = buf.capacity() - offset;
-    abort_fuzzing_if(precision > 100000);
-    // Suppress the warning about a nonliteral format string.
-    // Cannot use auto because of a bug in MinGW (#1532).
-    int (*snprintf_ptr)(char*, size_t, const char*, ...) = FMT_SNPRINTF;
-    int result = precision >= 0
-                     ? snprintf_ptr(begin, capacity, format, precision, value)
-                     : snprintf_ptr(begin, capacity, format, value);
-    if (result < 0) {
-      // The buffer will grow exponentially.
-      buf.try_reserve(buf.capacity() + 1);
-      continue;
-    }
-    auto size = to_unsigned(result);
-    // Size equal to capacity means that the last character was truncated.
-    if (size < capacity) {
-      buf.try_resize(size + offset);
-      return 0;
-    }
-    buf.try_reserve(size + offset + 1);  // Add 1 for the terminating '\0'.
-  }
-}
-
-template <typename T>
+template <typename T, bool doublish = num_bits<T>() == num_bits<double>()>
 using convert_float_result =
-    conditional_t<std::is_same<T, float>::value ||
-                      std::numeric_limits<T>::digits ==
-                          std::numeric_limits<double>::digits,
-                  double, T>;
+    conditional_t<std::is_same<T, float>::value || doublish, double, T>;
 
 template <typename T>
 constexpr auto convert_float(T value) -> convert_float_result<T> {
   return static_cast<convert_float_result<T>>(value);
 }
 
-template <typename OutputIt, typename Char>
-FMT_NOINLINE FMT_CONSTEXPR auto fill(OutputIt it, size_t n,
-                                     const fill_t<Char>& fill) -> OutputIt {
+template <typename Char, typename OutputIt>
+FMT_NOINLINE FMT_CONSTEXPR auto fill(OutputIt it, size_t n, const fill_t& fill)
+    -> OutputIt {
   auto fill_size = fill.size();
-  if (fill_size == 1) return detail::fill_n(it, n, fill[0]);
-  auto data = fill.data();
-  for (size_t i = 0; i < n; ++i)
-    it = copy_str<Char>(data, data + fill_size, it);
+  if (fill_size == 1) return detail::fill_n(it, n, fill.template get<Char>());
+  if (const Char* data = fill.template data<Char>()) {
+    for (size_t i = 0; i < n; ++i) it = copy<Char>(data, data + fill_size, it);
+  }
   return it;
 }
 
 // Writes the output of f, padded according to format specifications in specs.
 // size: output size in code units.
 // width: output display width in (terminal) column positions.
-template <align::type align = align::left, typename OutputIt, typename Char,
+template <typename Char, align::type align = align::left, typename OutputIt,
           typename F>
-FMT_CONSTEXPR auto write_padded(OutputIt out,
-                                const basic_format_specs<Char>& specs,
+FMT_CONSTEXPR auto write_padded(OutputIt out, const format_specs& specs,
                                 size_t size, size_t width, F&& f) -> OutputIt {
   static_assert(align == align::left || align == align::right, "");
   unsigned spec_width = to_unsigned(specs.width);
@@ -1714,33 +1725,32 @@ FMT_CONSTEXPR auto write_padded(OutputIt out,
   size_t left_padding = padding >> shifts[specs.align];
   size_t right_padding = padding - left_padding;
   auto it = reserve(out, size + padding * specs.fill.size());
-  if (left_padding != 0) it = fill(it, left_padding, specs.fill);
+  if (left_padding != 0) it = fill<Char>(it, left_padding, specs.fill);
   it = f(it);
-  if (right_padding != 0) it = fill(it, right_padding, specs.fill);
+  if (right_padding != 0) it = fill<Char>(it, right_padding, specs.fill);
   return base_iterator(out, it);
 }
 
-template <align::type align = align::left, typename OutputIt, typename Char,
+template <typename Char, align::type align = align::left, typename OutputIt,
           typename F>
-constexpr auto write_padded(OutputIt out, const basic_format_specs<Char>& specs,
+constexpr auto write_padded(OutputIt out, const format_specs& specs,
                             size_t size, F&& f) -> OutputIt {
-  return write_padded<align>(out, specs, size, size, f);
+  return write_padded<Char, align>(out, specs, size, size, f);
 }
 
-template <align::type align = align::left, typename Char, typename OutputIt>
+template <typename Char, align::type align = align::left, typename OutputIt>
 FMT_CONSTEXPR auto write_bytes(OutputIt out, string_view bytes,
-                               const basic_format_specs<Char>& specs)
-    -> OutputIt {
-  return write_padded<align>(
+                               const format_specs& specs = {}) -> OutputIt {
+  return write_padded<Char, align>(
       out, specs, bytes.size(), [bytes](reserve_iterator<OutputIt> it) {
         const char* data = bytes.data();
-        return copy_str<Char>(data, data + bytes.size(), it);
+        return copy<Char>(data, data + bytes.size(), it);
       });
 }
 
 template <typename Char, typename OutputIt, typename UIntPtr>
-auto write_ptr(OutputIt out, UIntPtr value,
-               const basic_format_specs<Char>* specs) -> OutputIt {
+auto write_ptr(OutputIt out, UIntPtr value, const format_specs* specs)
+    -> OutputIt {
   int num_digits = count_digits<4>(value);
   auto size = to_unsigned(num_digits) + size_t(2);
   auto write = [=](reserve_iterator<OutputIt> it) {
@@ -1748,7 +1758,7 @@ auto write_ptr(OutputIt out, UIntPtr value,
     *it++ = static_cast<Char>('x');
     return format_uint<4, Char>(it, value, num_digits);
   };
-  return specs ? write_padded<align::right>(out, *specs, size, write)
+  return specs ? write_padded<Char, align::right>(out, *specs, size, write)
                : base_iterator(out, write(reserve(out, size)));
 }
 
@@ -1766,17 +1776,11 @@ template <typename Char> struct find_escape_result {
   uint32_t cp;
 };
 
-template <typename Char>
-using make_unsigned_char =
-    typename conditional_t<std::is_integral<Char>::value,
-                           std::make_unsigned<Char>,
-                           type_identity<uint32_t>>::type;
-
 template <typename Char>
 auto find_escape(const Char* begin, const Char* end)
     -> find_escape_result<Char> {
   for (; begin != end; ++begin) {
-    uint32_t cp = static_cast<make_unsigned_char<Char>>(*begin);
+    uint32_t cp = static_cast<unsigned_char<Char>>(*begin);
     if (const_check(sizeof(Char) == 1) && cp >= 0x80) continue;
     if (needs_escape(cp)) return {begin, begin + 1, cp};
   }
@@ -1785,7 +1789,7 @@ auto find_escape(const Char* begin, const Char* end)
 
 inline auto find_escape(const char* begin, const char* end)
     -> find_escape_result<char> {
-  if (!is_utf8()) return find_escape<char>(begin, end);
+  if (!use_utf8()) return find_escape<char>(begin, end);
   auto result = find_escape_result<char>{end, nullptr, 0};
   for_each_codepoint(string_view(begin, to_unsigned(end - begin)),
                      [&](uint32_t cp, string_view sv) {
@@ -1802,7 +1806,7 @@ inline auto find_escape(const char* begin, const char* end)
   [] {                                                                        \
     /* Use the hidden visibility as a workaround for a GCC bug (#1973). */    \
     /* Use a macro-like name to avoid shadowing warnings. */                  \
-    struct FMT_GCC_VISIBILITY_HIDDEN FMT_COMPILE_STRING : base {              \
+    struct FMT_VISIBILITY("hidden") FMT_COMPILE_STRING : base {               \
       using char_type FMT_MAYBE_UNUSED = fmt::remove_cvref_t<decltype(s[0])>; \
       FMT_MAYBE_UNUSED FMT_CONSTEXPR explicit                                 \
       operator fmt::basic_string_view<char_type>() const {                    \
@@ -1813,14 +1817,12 @@ inline auto find_escape(const char* begin, const char* end)
   }()
 
 /**
-  \rst
-  Constructs a compile-time format string from a string literal *s*.
-
-  **Example**::
-
-    // A compile-time error because 'd' is an invalid specifier for strings.
-    std::string s = fmt::format(FMT_STRING("{:d}"), "foo");
-  \endrst
+ * Constructs a compile-time format string from a string literal `s`.
+ *
+ * **Example**:
+ *
+ *     // A compile-time error because 'd' is an invalid specifier for strings.
+ *     std::string s = fmt::format(FMT_STRING("{:d}"), "foo");
  */
 #define FMT_STRING(s) FMT_STRING_IMPL(s, fmt::detail::compile_string, )
 
@@ -1831,7 +1833,7 @@ auto write_codepoint(OutputIt out, char prefix, uint32_t cp) -> OutputIt {
   Char buf[width];
   fill_n(buf, width, static_cast<Char>('0'));
   format_uint<4>(buf, cp, width);
-  return copy_str<Char>(buf, buf + width, out);
+  return copy<Char>(buf, buf + width, out);
 }
 
 template <typename OutputIt, typename Char>
@@ -1859,17 +1861,11 @@ auto write_escaped_cp(OutputIt out, const find_escape_result<Char>& escape)
     *out++ = static_cast<Char>('\\');
     break;
   default:
-    if (is_utf8()) {
-      if (escape.cp < 0x100) {
-        return write_codepoint<2, Char>(out, 'x', escape.cp);
-      }
-      if (escape.cp < 0x10000) {
-        return write_codepoint<4, Char>(out, 'u', escape.cp);
-      }
-      if (escape.cp < 0x110000) {
-        return write_codepoint<8, Char>(out, 'U', escape.cp);
-      }
-    }
+    if (escape.cp < 0x100) return write_codepoint<2, Char>(out, 'x', escape.cp);
+    if (escape.cp < 0x10000)
+      return write_codepoint<4, Char>(out, 'u', escape.cp);
+    if (escape.cp < 0x110000)
+      return write_codepoint<8, Char>(out, 'U', escape.cp);
     for (Char escape_char : basic_string_view<Char>(
              escape.begin, to_unsigned(escape.end - escape.begin))) {
       out = write_codepoint<2, Char>(out, 'x',
@@ -1888,7 +1884,7 @@ auto write_escaped_string(OutputIt out, basic_string_view<Char> str)
   auto begin = str.begin(), end = str.end();
   do {
     auto escape = find_escape(begin, end);
-    out = copy_str<Char>(begin, escape.begin, out);
+    out = copy<Char>(begin, escape.begin, out);
     begin = escape.end;
     if (!begin) break;
     out = write_escaped_cp<OutputIt, Char>(out, escape);
@@ -1899,11 +1895,13 @@ auto write_escaped_string(OutputIt out, basic_string_view<Char> str)
 
 template <typename Char, typename OutputIt>
 auto write_escaped_char(OutputIt out, Char v) -> OutputIt {
+  Char v_array[1] = {v};
   *out++ = static_cast<Char>('\'');
   if ((needs_escape(static_cast<uint32_t>(v)) && v != static_cast<Char>('"')) ||
       v == static_cast<Char>('\'')) {
-    out = write_escaped_cp(
-        out, find_escape_result<Char>{&v, &v + 1, static_cast<uint32_t>(v)});
+    out = write_escaped_cp(out,
+                           find_escape_result<Char>{v_array, v_array + 1,
+                                                    static_cast<uint32_t>(v)});
   } else {
     *out++ = v;
   }
@@ -1913,22 +1911,23 @@ auto write_escaped_char(OutputIt out, Char v) -> OutputIt {
 
 template <typename Char, typename OutputIt>
 FMT_CONSTEXPR auto write_char(OutputIt out, Char value,
-                              const basic_format_specs<Char>& specs)
-    -> OutputIt {
+                              const format_specs& specs) -> OutputIt {
   bool is_debug = specs.type == presentation_type::debug;
-  return write_padded(out, specs, 1, [=](reserve_iterator<OutputIt> it) {
+  return write_padded<Char>(out, specs, 1, [=](reserve_iterator<OutputIt> it) {
     if (is_debug) return write_escaped_char(it, value);
     *it++ = value;
     return it;
   });
 }
 template <typename Char, typename OutputIt>
-FMT_CONSTEXPR auto write(OutputIt out, Char value,
-                         const basic_format_specs<Char>& specs,
+FMT_CONSTEXPR auto write(OutputIt out, Char value, const format_specs& specs,
                          locale_ref loc = {}) -> OutputIt {
+  // char is formatted as unsigned char for consistency across platforms.
+  using unsigned_type =
+      conditional_t<std::is_same<Char, char>::value, unsigned char, unsigned>;
   return check_char_specs(specs)
-             ? write_char(out, value, specs)
-             : write(out, static_cast<int>(value), specs, loc);
+             ? write_char<Char>(out, value, specs)
+             : write<Char>(out, static_cast<unsigned_type>(value), specs, loc);
 }
 
 // Data for write_int that doesn't depend on output iterator type. It is used to
@@ -1938,7 +1937,7 @@ template <typename Char> struct write_int_data {
   size_t padding;
 
   FMT_CONSTEXPR write_int_data(int num_digits, unsigned prefix,
-                               const basic_format_specs<Char>& specs)
+                               const format_specs& specs)
       : size((prefix >> 24) + to_unsigned(num_digits)), padding(0) {
     if (specs.align == align::numeric) {
       auto width = to_unsigned(specs.width);
@@ -1957,10 +1956,10 @@ template <typename Char> struct write_int_data {
 //   <left-padding><prefix><numeric-padding><digits><right-padding>
 // where <digits> are written by write_digits(it).
 // prefix contains chars in three lower bytes and the size in the fourth byte.
-template <typename OutputIt, typename Char, typename W>
+template <typename Char, typename OutputIt, typename W>
 FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, int num_digits,
                                         unsigned prefix,
-                                        const basic_format_specs<Char>& specs,
+                                        const format_specs& specs,
                                         W write_digits) -> OutputIt {
   // Slightly faster check for specs.width == 0 && specs.precision == -1.
   if ((specs.width | (specs.precision + 1)) == 0) {
@@ -1972,7 +1971,7 @@ FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, int num_digits,
     return base_iterator(out, write_digits(it));
   }
   auto data = write_int_data<Char>(num_digits, prefix, specs);
-  return write_padded<align::right>(
+  return write_padded<Char, align::right>(
       out, specs, data.size, [=](reserve_iterator<OutputIt> it) {
         for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
           *it++ = static_cast<Char>(p & 0xff);
@@ -1990,10 +1989,10 @@ template <typename Char> class digit_grouping {
     std::string::const_iterator group;
     int pos;
   };
-  next_state initial_state() const { return {grouping_.begin(), 0}; }
+  auto initial_state() const -> next_state { return {grouping_.begin(), 0}; }
 
   // Returns the next digit group separator position.
-  int next(next_state& state) const {
+  auto next(next_state& state) const -> int {
     if (thousands_sep_.empty()) return max_value<int>();
     if (state.group == grouping_.end()) return state.pos += grouping_.back();
     if (*state.group <= 0 || *state.group == max_value<char>())
@@ -2012,9 +2011,9 @@ template <typename Char> class digit_grouping {
   digit_grouping(std::string grouping, std::basic_string<Char> sep)
       : grouping_(std::move(grouping)), thousands_sep_(std::move(sep)) {}
 
-  bool has_separator() const { return !thousands_sep_.empty(); }
+  auto has_separator() const -> bool { return !thousands_sep_.empty(); }
 
-  int count_separators(int num_digits) const {
+  auto count_separators(int num_digits) const -> int {
     int count = 0;
     auto state = initial_state();
     while (num_digits > next(state)) ++count;
@@ -2023,7 +2022,7 @@ template <typename Char> class digit_grouping {
 
   // Applies grouping to digits and write the output to out.
   template <typename Out, typename C>
-  Out apply(Out out, basic_string_view<C> digits) const {
+  auto apply(Out out, basic_string_view<C> digits) const -> Out {
     auto num_digits = static_cast<int>(digits.size());
     auto separators = basic_memory_buffer<int>();
     separators.push_back(0);
@@ -2035,9 +2034,8 @@ template <typename Char> class digit_grouping {
     for (int i = 0, sep_index = static_cast<int>(separators.size() - 1);
          i < num_digits; ++i) {
       if (num_digits - i == separators[sep_index]) {
-        out =
-            copy_str<Char>(thousands_sep_.data(),
-                           thousands_sep_.data() + thousands_sep_.size(), out);
+        out = copy<Char>(thousands_sep_.data(),
+                         thousands_sep_.data() + thousands_sep_.size(), out);
         --sep_index;
       }
       *out++ = static_cast<Char>(digits[to_unsigned(i)]);
@@ -2046,41 +2044,71 @@ template <typename Char> class digit_grouping {
   }
 };
 
+FMT_CONSTEXPR inline void prefix_append(unsigned& prefix, unsigned value) {
+  prefix |= prefix != 0 ? value << 8 : value;  // shift past an existing char
+  prefix += (1u + (value > 0xff ? 1 : 0)) << 24;  // bump size in top byte
+}
+
 // Writes a decimal integer with digit grouping.
 template <typename OutputIt, typename UInt, typename Char>
 auto write_int(OutputIt out, UInt value, unsigned prefix,
-               const basic_format_specs<Char>& specs,
-               const digit_grouping<Char>& grouping) -> OutputIt {
+               const format_specs& specs, const digit_grouping<Char>& grouping)
+    -> OutputIt {
   static_assert(std::is_same<uint64_or_128_t<UInt>, UInt>::value, "");
-  int num_digits = count_digits(value);
-  char digits[40];
-  format_decimal(digits, value, num_digits);
-  unsigned size = to_unsigned((prefix != 0 ? 1 : 0) + num_digits +
-                              grouping.count_separators(num_digits));
-  return write_padded<align::right>(
+  int num_digits = 0;
+  auto buffer = memory_buffer();
+  switch (specs.type) {
+  default:
+    FMT_ASSERT(false, "");
+    FMT_FALLTHROUGH;
+  case presentation_type::none:
+  case presentation_type::dec:
+    num_digits = count_digits(value);
+    format_decimal<char>(appender(buffer), value, num_digits);
+    break;
+  case presentation_type::hex:
+    if (specs.alt)
+      prefix_append(prefix, unsigned(specs.upper ? 'X' : 'x') << 8 | '0');
+    num_digits = count_digits<4>(value);
+    format_uint<4, char>(appender(buffer), value, num_digits, specs.upper);
+    break;
+  case presentation_type::oct:
+    num_digits = count_digits<3>(value);
+    // Octal prefix '0' is counted as a digit, so only add it if precision
+    // is not greater than the number of digits.
+    if (specs.alt && specs.precision <= num_digits && value != 0)
+      prefix_append(prefix, '0');
+    format_uint<3, char>(appender(buffer), value, num_digits);
+    break;
+  case presentation_type::bin:
+    if (specs.alt)
+      prefix_append(prefix, unsigned(specs.upper ? 'B' : 'b') << 8 | '0');
+    num_digits = count_digits<1>(value);
+    format_uint<1, char>(appender(buffer), value, num_digits);
+    break;
+  case presentation_type::chr:
+    return write_char<Char>(out, static_cast<Char>(value), specs);
+  }
+
+  unsigned size = (prefix != 0 ? prefix >> 24 : 0) + to_unsigned(num_digits) +
+                  to_unsigned(grouping.count_separators(num_digits));
+  return write_padded<Char, align::right>(
       out, specs, size, size, [&](reserve_iterator<OutputIt> it) {
-        if (prefix != 0) {
-          char sign = static_cast<char>(prefix);
-          *it++ = static_cast<Char>(sign);
-        }
-        return grouping.apply(it, string_view(digits, to_unsigned(num_digits)));
+        for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8)
+          *it++ = static_cast<Char>(p & 0xff);
+        return grouping.apply(it, string_view(buffer.data(), buffer.size()));
       });
 }
 
 // Writes a localized value.
 FMT_API auto write_loc(appender out, loc_value value, const format_specs& specs,
                        locale_ref loc) -> bool;
-template <typename OutputIt, typename Char>
-inline auto write_loc(OutputIt, loc_value, const basic_format_specs<Char>&,
-                      locale_ref) -> bool {
+template <typename OutputIt>
+inline auto write_loc(OutputIt, loc_value, const format_specs&, locale_ref)
+    -> bool {
   return false;
 }
 
-FMT_CONSTEXPR inline void prefix_append(unsigned& prefix, unsigned value) {
-  prefix |= prefix != 0 ? value << 8 : value;
-  prefix += (1u + (value > 0xff ? 1 : 0)) << 24;
-}
-
 template <typename UInt> struct write_int_arg {
   UInt abs_value;
   unsigned prefix;
@@ -2103,8 +2131,8 @@ FMT_CONSTEXPR auto make_write_int_arg(T value, sign_t sign)
 }
 
 template <typename Char = char> struct loc_writer {
-  buffer_appender<Char> out;
-  const basic_format_specs<Char>& specs;
+  basic_appender<Char> out;
+  const format_specs& specs;
   std::basic_string<Char> sep;
   std::string grouping;
   std::basic_string<Char> decimal_point;
@@ -2117,97 +2145,94 @@ template <typename Char = char> struct loc_writer {
     return true;
   }
 
-  template <typename T, FMT_ENABLE_IF(is_floating_point<T>::value)>
+  template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
   auto operator()(T) -> bool {
     return false;
   }
-
-  auto operator()(...) -> bool { return false; }
 };
 
 template <typename Char, typename OutputIt, typename T>
 FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, write_int_arg<T> arg,
-                                        const basic_format_specs<Char>& specs,
-                                        locale_ref) -> OutputIt {
+                                        const format_specs& specs, locale_ref)
+    -> OutputIt {
   static_assert(std::is_same<T, uint32_or_64_or_128_t<T>>::value, "");
   auto abs_value = arg.abs_value;
   auto prefix = arg.prefix;
   switch (specs.type) {
+  default:
+    FMT_ASSERT(false, "");
+    FMT_FALLTHROUGH;
   case presentation_type::none:
   case presentation_type::dec: {
-    auto num_digits = count_digits(abs_value);
-    return write_int(
+    int num_digits = count_digits(abs_value);
+    return write_int<Char>(
         out, num_digits, prefix, specs, [=](reserve_iterator<OutputIt> it) {
           return format_decimal<Char>(it, abs_value, num_digits).end;
         });
   }
-  case presentation_type::hex_lower:
-  case presentation_type::hex_upper: {
-    bool upper = specs.type == presentation_type::hex_upper;
+  case presentation_type::hex: {
     if (specs.alt)
-      prefix_append(prefix, unsigned(upper ? 'X' : 'x') << 8 | '0');
+      prefix_append(prefix, unsigned(specs.upper ? 'X' : 'x') << 8 | '0');
     int num_digits = count_digits<4>(abs_value);
-    return write_int(
+    return write_int<Char>(
         out, num_digits, prefix, specs, [=](reserve_iterator<OutputIt> it) {
-          return format_uint<4, Char>(it, abs_value, num_digits, upper);
+          return format_uint<4, Char>(it, abs_value, num_digits, specs.upper);
         });
   }
-  case presentation_type::bin_lower:
-  case presentation_type::bin_upper: {
-    bool upper = specs.type == presentation_type::bin_upper;
-    if (specs.alt)
-      prefix_append(prefix, unsigned(upper ? 'B' : 'b') << 8 | '0');
-    int num_digits = count_digits<1>(abs_value);
-    return write_int(out, num_digits, prefix, specs,
-                     [=](reserve_iterator<OutputIt> it) {
-                       return format_uint<1, Char>(it, abs_value, num_digits);
-                     });
-  }
   case presentation_type::oct: {
     int num_digits = count_digits<3>(abs_value);
     // Octal prefix '0' is counted as a digit, so only add it if precision
     // is not greater than the number of digits.
     if (specs.alt && specs.precision <= num_digits && abs_value != 0)
       prefix_append(prefix, '0');
-    return write_int(out, num_digits, prefix, specs,
-                     [=](reserve_iterator<OutputIt> it) {
-                       return format_uint<3, Char>(it, abs_value, num_digits);
-                     });
+    return write_int<Char>(
+        out, num_digits, prefix, specs, [=](reserve_iterator<OutputIt> it) {
+          return format_uint<3, Char>(it, abs_value, num_digits);
+        });
+  }
+  case presentation_type::bin: {
+    if (specs.alt)
+      prefix_append(prefix, unsigned(specs.upper ? 'B' : 'b') << 8 | '0');
+    int num_digits = count_digits<1>(abs_value);
+    return write_int<Char>(
+        out, num_digits, prefix, specs, [=](reserve_iterator<OutputIt> it) {
+          return format_uint<1, Char>(it, abs_value, num_digits);
+        });
   }
   case presentation_type::chr:
-    return write_char(out, static_cast<Char>(abs_value), specs);
-  default:
-    throw_format_error("invalid type specifier");
+    return write_char<Char>(out, static_cast<Char>(abs_value), specs);
   }
-  return out;
 }
 template <typename Char, typename OutputIt, typename T>
-FMT_CONSTEXPR FMT_NOINLINE auto write_int_noinline(
-    OutputIt out, write_int_arg<T> arg, const basic_format_specs<Char>& specs,
-    locale_ref loc) -> OutputIt {
-  return write_int(out, arg, specs, loc);
+FMT_CONSTEXPR FMT_NOINLINE auto write_int_noinline(OutputIt out,
+                                                   write_int_arg<T> arg,
+                                                   const format_specs& specs,
+                                                   locale_ref loc) -> OutputIt {
+  return write_int<Char>(out, arg, specs, loc);
 }
-template <typename Char, typename OutputIt, typename T,
+template <typename Char, typename T,
           FMT_ENABLE_IF(is_integral<T>::value &&
                         !std::is_same<T, bool>::value &&
-                        std::is_same<OutputIt, buffer_appender<Char>>::value)>
-FMT_CONSTEXPR FMT_INLINE auto write(OutputIt out, T value,
-                                    const basic_format_specs<Char>& specs,
-                                    locale_ref loc) -> OutputIt {
+                        !std::is_same<T, Char>::value)>
+FMT_CONSTEXPR FMT_INLINE auto write(basic_appender<Char> out, T value,
+                                    const format_specs& specs, locale_ref loc)
+    -> basic_appender<Char> {
   if (specs.localized && write_loc(out, value, specs, loc)) return out;
-  return write_int_noinline(out, make_write_int_arg(value, specs.sign), specs,
-                            loc);
+  return write_int_noinline<Char>(out, make_write_int_arg(value, specs.sign),
+                                  specs, loc);
 }
 // An inlined version of write used in format string compilation.
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(is_integral<T>::value &&
                         !std::is_same<T, bool>::value &&
-                        !std::is_same<OutputIt, buffer_appender<Char>>::value)>
+                        !std::is_same<T, Char>::value &&
+                        !std::is_same<OutputIt, basic_appender<Char>>::value)>
 FMT_CONSTEXPR FMT_INLINE auto write(OutputIt out, T value,
-                                    const basic_format_specs<Char>& specs,
-                                    locale_ref loc) -> OutputIt {
+                                    const format_specs& specs, locale_ref loc)
+    -> OutputIt {
   if (specs.localized && write_loc(out, value, specs, loc)) return out;
-  return write_int(out, make_write_int_arg(value, specs.sign), specs, loc);
+  return write_int<Char>(out, make_write_int_arg(value, specs.sign), specs,
+                         loc);
 }
 
 // An output iterator that counts the number of objects written to it and
@@ -2229,63 +2254,64 @@ class counting_iterator {
 
   FMT_CONSTEXPR counting_iterator() : count_(0) {}
 
-  FMT_CONSTEXPR size_t count() const { return count_; }
+  FMT_CONSTEXPR auto count() const -> size_t { return count_; }
 
-  FMT_CONSTEXPR counting_iterator& operator++() {
+  FMT_CONSTEXPR auto operator++() -> counting_iterator& {
     ++count_;
     return *this;
   }
-  FMT_CONSTEXPR counting_iterator operator++(int) {
+  FMT_CONSTEXPR auto operator++(int) -> counting_iterator {
     auto it = *this;
     ++*this;
     return it;
   }
 
-  FMT_CONSTEXPR friend counting_iterator operator+(counting_iterator it,
-                                                   difference_type n) {
+  FMT_CONSTEXPR friend auto operator+(counting_iterator it, difference_type n)
+      -> counting_iterator {
     it.count_ += static_cast<size_t>(n);
     return it;
   }
 
-  FMT_CONSTEXPR value_type operator*() const { return {}; }
+  FMT_CONSTEXPR auto operator*() const -> value_type { return {}; }
 };
 
 template <typename Char, typename OutputIt>
 FMT_CONSTEXPR auto write(OutputIt out, basic_string_view<Char> s,
-                         const basic_format_specs<Char>& specs) -> OutputIt {
+                         const format_specs& specs) -> OutputIt {
   auto data = s.data();
   auto size = s.size();
   if (specs.precision >= 0 && to_unsigned(specs.precision) < size)
     size = code_point_index(s, to_unsigned(specs.precision));
   bool is_debug = specs.type == presentation_type::debug;
   size_t width = 0;
+
+  if (is_debug) size = write_escaped_string(counting_iterator{}, s).count();
+
   if (specs.width != 0) {
     if (is_debug)
-      width = write_escaped_string(counting_iterator{}, s).count();
+      width = size;
     else
       width = compute_width(basic_string_view<Char>(data, size));
   }
-  return write_padded(out, specs, size, width,
-                      [=](reserve_iterator<OutputIt> it) {
-                        if (is_debug) return write_escaped_string(it, s);
-                        return copy_str<Char>(data, data + size, it);
-                      });
+  return write_padded<Char>(out, specs, size, width,
+                            [=](reserve_iterator<OutputIt> it) {
+                              if (is_debug) return write_escaped_string(it, s);
+                              return copy<Char>(data, data + size, it);
+                            });
 }
 template <typename Char, typename OutputIt>
 FMT_CONSTEXPR auto write(OutputIt out,
                          basic_string_view<type_identity_t<Char>> s,
-                         const basic_format_specs<Char>& specs, locale_ref)
-    -> OutputIt {
-  check_string_type_spec(specs.type);
-  return write(out, s, specs);
+                         const format_specs& specs, locale_ref) -> OutputIt {
+  return write<Char>(out, s, specs);
 }
 template <typename Char, typename OutputIt>
-FMT_CONSTEXPR auto write(OutputIt out, const Char* s,
-                         const basic_format_specs<Char>& specs, locale_ref)
-    -> OutputIt {
-  return check_cstring_type_spec(specs.type)
-             ? write(out, basic_string_view<Char>(s), specs, {})
-             : write_ptr<Char>(out, bit_cast<uintptr_t>(s), &specs);
+FMT_CONSTEXPR auto write(OutputIt out, const Char* s, const format_specs& specs,
+                         locale_ref) -> OutputIt {
+  if (specs.type == presentation_type::pointer)
+    return write_ptr<Char>(out, bit_cast<uintptr_t>(s), &specs);
+  if (!s) report_error("string pointer is null");
+  return write<Char>(out, basic_string_view<Char>(s), specs, {});
 }
 
 template <typename Char, typename OutputIt, typename T,
@@ -2299,34 +2325,118 @@ FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt {
   if (negative) abs_value = ~abs_value + 1;
   int num_digits = count_digits(abs_value);
   auto size = (negative ? 1 : 0) + static_cast<size_t>(num_digits);
-  auto it = reserve(out, size);
-  if (auto ptr = to_pointer<Char>(it, size)) {
+  if (auto ptr = to_pointer<Char>(out, size)) {
     if (negative) *ptr++ = static_cast<Char>('-');
     format_decimal<Char>(ptr, abs_value, num_digits);
     return out;
   }
-  if (negative) *it++ = static_cast<Char>('-');
-  it = format_decimal<Char>(it, abs_value, num_digits).end;
-  return base_iterator(out, it);
+  if (negative) *out++ = static_cast<Char>('-');
+  return format_decimal<Char>(out, abs_value, num_digits).end;
+}
+
+// DEPRECATED!
+template <typename Char>
+FMT_CONSTEXPR auto parse_align(const Char* begin, const Char* end,
+                               format_specs& specs) -> const Char* {
+  FMT_ASSERT(begin != end, "");
+  auto align = align::none;
+  auto p = begin + code_point_length(begin);
+  if (end - p <= 0) p = begin;
+  for (;;) {
+    switch (to_ascii(*p)) {
+    case '<':
+      align = align::left;
+      break;
+    case '>':
+      align = align::right;
+      break;
+    case '^':
+      align = align::center;
+      break;
+    }
+    if (align != align::none) {
+      if (p != begin) {
+        auto c = *begin;
+        if (c == '}') return begin;
+        if (c == '{') {
+          report_error("invalid fill character '{'");
+          return begin;
+        }
+        specs.fill = basic_string_view<Char>(begin, to_unsigned(p - begin));
+        begin = p + 1;
+      } else {
+        ++begin;
+      }
+      break;
+    } else if (p == begin) {
+      break;
+    }
+    p = begin;
+  }
+  specs.align = align;
+  return begin;
+}
+
+// A floating-point presentation format.
+enum class float_format : unsigned char {
+  general,  // General: exponent notation or fixed point based on magnitude.
+  exp,      // Exponent notation with the default precision of 6, e.g. 1.2e-3.
+  fixed     // Fixed point with the default precision of 6, e.g. 0.0012.
+};
+
+struct float_specs {
+  int precision;
+  float_format format : 8;
+  sign_t sign : 8;
+  bool locale : 1;
+  bool binary32 : 1;
+  bool showpoint : 1;
+};
+
+// DEPRECATED!
+FMT_CONSTEXPR inline auto parse_float_type_spec(const format_specs& specs)
+    -> float_specs {
+  auto result = float_specs();
+  result.showpoint = specs.alt;
+  result.locale = specs.localized;
+  switch (specs.type) {
+  default:
+    FMT_FALLTHROUGH;
+  case presentation_type::none:
+    result.format = float_format::general;
+    break;
+  case presentation_type::exp:
+    result.format = float_format::exp;
+    result.showpoint |= specs.precision != 0;
+    break;
+  case presentation_type::fixed:
+    result.format = float_format::fixed;
+    result.showpoint |= specs.precision != 0;
+    break;
+  case presentation_type::general:
+    result.format = float_format::general;
+    break;
+  }
+  return result;
 }
 
 template <typename Char, typename OutputIt>
 FMT_CONSTEXPR20 auto write_nonfinite(OutputIt out, bool isnan,
-                                     basic_format_specs<Char> specs,
-                                     const float_specs& fspecs) -> OutputIt {
+                                     format_specs specs, sign_t sign)
+    -> OutputIt {
   auto str =
-      isnan ? (fspecs.upper ? "NAN" : "nan") : (fspecs.upper ? "INF" : "inf");
+      isnan ? (specs.upper ? "NAN" : "nan") : (specs.upper ? "INF" : "inf");
   constexpr size_t str_size = 3;
-  auto sign = fspecs.sign;
   auto size = str_size + (sign ? 1 : 0);
   // Replace '0'-padding with space for non-finite values.
   const bool is_zero_fill =
-      specs.fill.size() == 1 && *specs.fill.data() == static_cast<Char>('0');
-  if (is_zero_fill) specs.fill[0] = static_cast<Char>(' ');
-  return write_padded(out, specs, size, [=](reserve_iterator<OutputIt> it) {
-    if (sign) *it++ = detail::sign<Char>(sign);
-    return copy_str<Char>(str, str + str_size, it);
-  });
+      specs.fill.size() == 1 && specs.fill.template get<Char>() == '0';
+  if (is_zero_fill) specs.fill = ' ';
+  return write_padded<Char>(out, specs, size,
+                            [=](reserve_iterator<OutputIt> it) {
+                              if (sign) *it++ = detail::sign<Char>(sign);
+                              return copy<Char>(str, str + str_size, it);
+                            });
 }
 
 // A decimal floating-point number significand * pow(10, exp).
@@ -2347,7 +2457,7 @@ inline auto get_significand_size(const dragonbox::decimal_fp<T>& f) -> int {
 template <typename Char, typename OutputIt>
 constexpr auto write_significand(OutputIt out, const char* significand,
                                  int significand_size) -> OutputIt {
-  return copy_str<Char>(significand, significand + significand_size, out);
+  return copy<Char>(significand, significand + significand_size, out);
 }
 template <typename Char, typename OutputIt, typename UInt>
 inline auto write_significand(OutputIt out, UInt significand,
@@ -2400,19 +2510,19 @@ inline auto write_significand(OutputIt out, UInt significand,
   Char buffer[digits10<UInt>() + 2];
   auto end = write_significand(buffer, significand, significand_size,
                                integral_size, decimal_point);
-  return detail::copy_str_noinline<Char>(buffer, end, out);
+  return detail::copy_noinline<Char>(buffer, end, out);
 }
 
 template <typename OutputIt, typename Char>
 FMT_CONSTEXPR auto write_significand(OutputIt out, const char* significand,
                                      int significand_size, int integral_size,
                                      Char decimal_point) -> OutputIt {
-  out = detail::copy_str_noinline<Char>(significand,
-                                        significand + integral_size, out);
+  out = detail::copy_noinline<Char>(significand, significand + integral_size,
+                                    out);
   if (!decimal_point) return out;
   *out++ = decimal_point;
-  return detail::copy_str_noinline<Char>(significand + integral_size,
-                                         significand + significand_size, out);
+  return detail::copy_noinline<Char>(significand + integral_size,
+                                     significand + significand_size, out);
 }
 
 template <typename OutputIt, typename Char, typename T, typename Grouping>
@@ -2425,18 +2535,18 @@ FMT_CONSTEXPR20 auto write_significand(OutputIt out, T significand,
                              decimal_point);
   }
   auto buffer = basic_memory_buffer<Char>();
-  write_significand(buffer_appender<Char>(buffer), significand,
-                    significand_size, integral_size, decimal_point);
+  write_significand(basic_appender<Char>(buffer), significand, significand_size,
+                    integral_size, decimal_point);
   grouping.apply(
       out, basic_string_view<Char>(buffer.data(), to_unsigned(integral_size)));
-  return detail::copy_str_noinline<Char>(buffer.data() + integral_size,
-                                         buffer.end(), out);
+  return detail::copy_noinline<Char>(buffer.data() + integral_size,
+                                     buffer.end(), out);
 }
 
-template <typename OutputIt, typename DecimalFP, typename Char,
+template <typename Char, typename OutputIt, typename DecimalFP,
           typename Grouping = digit_grouping<Char>>
 FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f,
-                                    const basic_format_specs<Char>& specs,
+                                    const format_specs& specs,
                                     float_specs fspecs, locale_ref loc)
     -> OutputIt {
   auto significand = f.significand;
@@ -2473,7 +2583,7 @@ FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f,
     if (abs_output_exp >= 100) exp_digits = abs_output_exp >= 1000 ? 4 : 3;
 
     size += to_unsigned((decimal_point ? 1 : 0) + 2 + exp_digits);
-    char exp_char = fspecs.upper ? 'E' : 'e';
+    char exp_char = specs.upper ? 'E' : 'e';
     auto write = [=](iterator it) {
       if (sign) *it++ = detail::sign<Char>(sign);
       // Insert a decimal point after the first digit and add an exponent.
@@ -2483,8 +2593,9 @@ FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f,
       *it++ = static_cast<Char>(exp_char);
       return write_exponent<Char>(output_exp, it);
     };
-    return specs.width > 0 ? write_padded<align::right>(out, specs, size, write)
-                           : base_iterator(out, write(reserve(out, size)));
+    return specs.width > 0
+               ? write_padded<Char, align::right>(out, specs, size, write)
+               : base_iterator(out, write(reserve(out, size)));
   }
 
   int exp = f.exponent + significand_size;
@@ -2495,12 +2606,12 @@ FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f,
     abort_fuzzing_if(num_zeros > 5000);
     if (fspecs.showpoint) {
       ++size;
-      if (num_zeros <= 0 && fspecs.format != float_format::fixed) num_zeros = 1;
+      if (num_zeros <= 0 && fspecs.format != float_format::fixed) num_zeros = 0;
       if (num_zeros > 0) size += to_unsigned(num_zeros);
     }
     auto grouping = Grouping(loc, fspecs.locale);
     size += to_unsigned(grouping.count_separators(exp));
-    return write_padded<align::right>(out, specs, size, [&](iterator it) {
+    return write_padded<Char, align::right>(out, specs, size, [&](iterator it) {
       if (sign) *it++ = detail::sign<Char>(sign);
       it = write_significand<Char>(it, significand, significand_size,
                                    f.exponent, grouping);
@@ -2513,8 +2624,8 @@ FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f,
     int num_zeros = fspecs.showpoint ? fspecs.precision - significand_size : 0;
     size += 1 + to_unsigned(num_zeros > 0 ? num_zeros : 0);
     auto grouping = Grouping(loc, fspecs.locale);
-    size += to_unsigned(grouping.count_separators(significand_size));
-    return write_padded<align::right>(out, specs, size, [&](iterator it) {
+    size += to_unsigned(grouping.count_separators(exp));
+    return write_padded<Char, align::right>(out, specs, size, [&](iterator it) {
       if (sign) *it++ = detail::sign<Char>(sign);
       it = write_significand(it, significand, significand_size, exp,
                              decimal_point, grouping);
@@ -2529,7 +2640,7 @@ FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f,
   }
   bool pointy = num_zeros != 0 || significand_size != 0 || fspecs.showpoint;
   size += 1 + (pointy ? 1 : 0) + to_unsigned(num_zeros);
-  return write_padded<align::right>(out, specs, size, [&](iterator it) {
+  return write_padded<Char, align::right>(out, specs, size, [&](iterator it) {
     if (sign) *it++ = detail::sign<Char>(sign);
     *it++ = zero;
     if (!pointy) return it;
@@ -2543,32 +2654,31 @@ template <typename Char> class fallback_digit_grouping {
  public:
   constexpr fallback_digit_grouping(locale_ref, bool) {}
 
-  constexpr bool has_separator() const { return false; }
+  constexpr auto has_separator() const -> bool { return false; }
 
-  constexpr int count_separators(int) const { return 0; }
+  constexpr auto count_separators(int) const -> int { return 0; }
 
   template <typename Out, typename C>
-  constexpr Out apply(Out out, basic_string_view<C>) const {
+  constexpr auto apply(Out out, basic_string_view<C>) const -> Out {
     return out;
   }
 };
 
-template <typename OutputIt, typename DecimalFP, typename Char>
+template <typename Char, typename OutputIt, typename DecimalFP>
 FMT_CONSTEXPR20 auto write_float(OutputIt out, const DecimalFP& f,
-                                 const basic_format_specs<Char>& specs,
-                                 float_specs fspecs, locale_ref loc)
-    -> OutputIt {
+                                 const format_specs& specs, float_specs fspecs,
+                                 locale_ref loc) -> OutputIt {
   if (is_constant_evaluated()) {
-    return do_write_float<OutputIt, DecimalFP, Char,
+    return do_write_float<Char, OutputIt, DecimalFP,
                           fallback_digit_grouping<Char>>(out, f, specs, fspecs,
                                                          loc);
   } else {
-    return do_write_float(out, f, specs, fspecs, loc);
+    return do_write_float<Char>(out, f, specs, fspecs, loc);
   }
 }
 
-template <typename T> constexpr bool isnan(T value) {
-  return !(value >= value);  // std::isnan doesn't support __float128.
+template <typename T> constexpr auto isnan(T value) -> bool {
+  return value != value;  // std::isnan doesn't support __float128.
 }
 
 template <typename T, typename Enable = void>
@@ -2580,14 +2690,14 @@ struct has_isfinite<T, enable_if_t<sizeof(std::isfinite(T())) != 0>>
 
 template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value&&
                                         has_isfinite<T>::value)>
-FMT_CONSTEXPR20 bool isfinite(T value) {
+FMT_CONSTEXPR20 auto isfinite(T value) -> bool {
   constexpr T inf = T(std::numeric_limits<double>::infinity());
   if (is_constant_evaluated())
     return !detail::isnan(value) && value < inf && value > -inf;
   return std::isfinite(value);
 }
 template <typename T, FMT_ENABLE_IF(!has_isfinite<T>::value)>
-FMT_CONSTEXPR bool isfinite(T value) {
+FMT_CONSTEXPR auto isfinite(T value) -> bool {
   T inf = T(std::numeric_limits<double>::infinity());
   // std::isfinite doesn't support __float128.
   return !detail::isnan(value) && value < inf && value > -inf;
@@ -2606,78 +2716,6 @@ FMT_INLINE FMT_CONSTEXPR bool signbit(T value) {
   return std::signbit(static_cast<double>(value));
 }
 
-enum class round_direction { unknown, up, down };
-
-// Given the divisor (normally a power of 10), the remainder = v % divisor for
-// some number v and the error, returns whether v should be rounded up, down, or
-// whether the rounding direction can't be determined due to error.
-// error should be less than divisor / 2.
-FMT_CONSTEXPR inline round_direction get_round_direction(uint64_t divisor,
-                                                         uint64_t remainder,
-                                                         uint64_t error) {
-  FMT_ASSERT(remainder < divisor, "");  // divisor - remainder won't overflow.
-  FMT_ASSERT(error < divisor, "");      // divisor - error won't overflow.
-  FMT_ASSERT(error < divisor - error, "");  // error * 2 won't overflow.
-  // Round down if (remainder + error) * 2 <= divisor.
-  if (remainder <= divisor - remainder && error * 2 <= divisor - remainder * 2)
-    return round_direction::down;
-  // Round up if (remainder - error) * 2 >= divisor.
-  if (remainder >= error &&
-      remainder - error >= divisor - (remainder - error)) {
-    return round_direction::up;
-  }
-  return round_direction::unknown;
-}
-
-namespace digits {
-enum result {
-  more,  // Generate more digits.
-  done,  // Done generating digits.
-  error  // Digit generation cancelled due to an error.
-};
-}
-
-struct gen_digits_handler {
-  char* buf;
-  int size;
-  int precision;
-  int exp10;
-  bool fixed;
-
-  FMT_CONSTEXPR digits::result on_digit(char digit, uint64_t divisor,
-                                        uint64_t remainder, uint64_t error,
-                                        bool integral) {
-    FMT_ASSERT(remainder < divisor, "");
-    buf[size++] = digit;
-    if (!integral && error >= remainder) return digits::error;
-    if (size < precision) return digits::more;
-    if (!integral) {
-      // Check if error * 2 < divisor with overflow prevention.
-      // The check is not needed for the integral part because error = 1
-      // and divisor > (1 << 32) there.
-      if (error >= divisor || error >= divisor - error) return digits::error;
-    } else {
-      FMT_ASSERT(error == 1 && divisor > 2, "");
-    }
-    auto dir = get_round_direction(divisor, remainder, error);
-    if (dir != round_direction::up)
-      return dir == round_direction::down ? digits::done : digits::error;
-    ++buf[size - 1];
-    for (int i = size - 1; i > 0 && buf[i] > '9'; --i) {
-      buf[i] = '0';
-      ++buf[i - 1];
-    }
-    if (buf[0] > '9') {
-      buf[0] = '1';
-      if (fixed)
-        buf[size++] = '0';
-      else
-        ++exp10;
-    }
-    return digits::done;
-  }
-};
-
 inline FMT_CONSTEXPR20 void adjust_precision(int& precision, int exp10) {
   // Adjust fixed precision by exponent because it is relative to decimal
   // point.
@@ -2686,101 +2724,6 @@ inline FMT_CONSTEXPR20 void adjust_precision(int& precision, int exp10) {
   precision += exp10;
 }
 
-// Generates output using the Grisu digit-gen algorithm.
-// error: the size of the region (lower, upper) outside of which numbers
-// definitely do not round to value (Delta in Grisu3).
-FMT_INLINE FMT_CONSTEXPR20 auto grisu_gen_digits(fp value, uint64_t error,
-                                                 int& exp,
-                                                 gen_digits_handler& handler)
-    -> digits::result {
-  const fp one(1ULL << -value.e, value.e);
-  // The integral part of scaled value (p1 in Grisu) = value / one. It cannot be
-  // zero because it contains a product of two 64-bit numbers with MSB set (due
-  // to normalization) - 1, shifted right by at most 60 bits.
-  auto integral = static_cast<uint32_t>(value.f >> -one.e);
-  FMT_ASSERT(integral != 0, "");
-  FMT_ASSERT(integral == value.f >> -one.e, "");
-  // The fractional part of scaled value (p2 in Grisu) c = value % one.
-  uint64_t fractional = value.f & (one.f - 1);
-  exp = count_digits(integral);  // kappa in Grisu.
-  // Non-fixed formats require at least one digit and no precision adjustment.
-  if (handler.fixed) {
-    adjust_precision(handler.precision, exp + handler.exp10);
-    // Check if precision is satisfied just by leading zeros, e.g.
-    // format("{:.2f}", 0.001) gives "0.00" without generating any digits.
-    if (handler.precision <= 0) {
-      if (handler.precision < 0) return digits::done;
-      // Divide by 10 to prevent overflow.
-      uint64_t divisor = data::power_of_10_64[exp - 1] << -one.e;
-      auto dir = get_round_direction(divisor, value.f / 10, error * 10);
-      if (dir == round_direction::unknown) return digits::error;
-      handler.buf[handler.size++] = dir == round_direction::up ? '1' : '0';
-      return digits::done;
-    }
-  }
-  // Generate digits for the integral part. This can produce up to 10 digits.
-  do {
-    uint32_t digit = 0;
-    auto divmod_integral = [&](uint32_t divisor) {
-      digit = integral / divisor;
-      integral %= divisor;
-    };
-    // This optimization by Milo Yip reduces the number of integer divisions by
-    // one per iteration.
-    switch (exp) {
-    case 10:
-      divmod_integral(1000000000);
-      break;
-    case 9:
-      divmod_integral(100000000);
-      break;
-    case 8:
-      divmod_integral(10000000);
-      break;
-    case 7:
-      divmod_integral(1000000);
-      break;
-    case 6:
-      divmod_integral(100000);
-      break;
-    case 5:
-      divmod_integral(10000);
-      break;
-    case 4:
-      divmod_integral(1000);
-      break;
-    case 3:
-      divmod_integral(100);
-      break;
-    case 2:
-      divmod_integral(10);
-      break;
-    case 1:
-      digit = integral;
-      integral = 0;
-      break;
-    default:
-      FMT_ASSERT(false, "invalid number of digits");
-    }
-    --exp;
-    auto remainder = (static_cast<uint64_t>(integral) << -one.e) + fractional;
-    auto result = handler.on_digit(static_cast<char>('0' + digit),
-                                   data::power_of_10_64[exp] << -one.e,
-                                   remainder, error, true);
-    if (result != digits::more) return result;
-  } while (exp > 0);
-  // Generate digits for the fractional part.
-  for (;;) {
-    fractional *= 10;
-    error *= 10;
-    char digit = static_cast<char>('0' + (fractional >> -one.e));
-    fractional &= one.f - 1;
-    --exp;
-    auto result = handler.on_digit(digit, one.f, fractional, error, false);
-    if (result != digits::more) return result;
-  }
-}
-
 class bigint {
  private:
   // A bigint is stored as an array of bigits (big digits), with bigit at index
@@ -2791,10 +2734,10 @@ class bigint {
   basic_memory_buffer<bigit, bigits_capacity> bigits_;
   int exp_;
 
-  FMT_CONSTEXPR20 bigit operator[](int index) const {
+  FMT_CONSTEXPR20 auto operator[](int index) const -> bigit {
     return bigits_[to_unsigned(index)];
   }
-  FMT_CONSTEXPR20 bigit& operator[](int index) {
+  FMT_CONSTEXPR20 auto operator[](int index) -> bigit& {
     return bigits_[to_unsigned(index)];
   }
 
@@ -2881,7 +2824,7 @@ class bigint {
     auto size = other.bigits_.size();
     bigits_.resize(size);
     auto data = other.bigits_.data();
-    std::copy(data, data + size, make_checked(bigits_.data(), size));
+    copy<bigit>(data, data + size, bigits_.data());
     exp_ = other.exp_;
   }
 
@@ -2890,11 +2833,11 @@ class bigint {
     assign(uint64_or_128_t<Int>(n));
   }
 
-  FMT_CONSTEXPR20 int num_bigits() const {
+  FMT_CONSTEXPR20 auto num_bigits() const -> int {
     return static_cast<int>(bigits_.size()) + exp_;
   }
 
-  FMT_NOINLINE FMT_CONSTEXPR20 bigint& operator<<=(int shift) {
+  FMT_NOINLINE FMT_CONSTEXPR20 auto operator<<=(int shift) -> bigint& {
     FMT_ASSERT(shift >= 0, "");
     exp_ += shift / bigit_bits;
     shift %= bigit_bits;
@@ -2909,13 +2852,15 @@ class bigint {
     return *this;
   }
 
-  template <typename Int> FMT_CONSTEXPR20 bigint& operator*=(Int value) {
+  template <typename Int>
+  FMT_CONSTEXPR20 auto operator*=(Int value) -> bigint& {
     FMT_ASSERT(value > 0, "");
     multiply(uint32_or_64_or_128_t<Int>(value));
     return *this;
   }
 
-  friend FMT_CONSTEXPR20 int compare(const bigint& lhs, const bigint& rhs) {
+  friend FMT_CONSTEXPR20 auto compare(const bigint& lhs, const bigint& rhs)
+      -> int {
     int num_lhs_bigits = lhs.num_bigits(), num_rhs_bigits = rhs.num_bigits();
     if (num_lhs_bigits != num_rhs_bigits)
       return num_lhs_bigits > num_rhs_bigits ? 1 : -1;
@@ -2932,8 +2877,9 @@ class bigint {
   }
 
   // Returns compare(lhs1 + lhs2, rhs).
-  friend FMT_CONSTEXPR20 int add_compare(const bigint& lhs1, const bigint& lhs2,
-                                         const bigint& rhs) {
+  friend FMT_CONSTEXPR20 auto add_compare(const bigint& lhs1,
+                                          const bigint& lhs2, const bigint& rhs)
+      -> int {
     auto minimum = [](int a, int b) { return a < b ? a : b; };
     auto maximum = [](int a, int b) { return a > b ? a : b; };
     int max_lhs_bigits = maximum(lhs1.num_bigits(), lhs2.num_bigits());
@@ -3014,13 +2960,13 @@ class bigint {
     bigits_.resize(to_unsigned(num_bigits + exp_difference));
     for (int i = num_bigits - 1, j = i + exp_difference; i >= 0; --i, --j)
       bigits_[j] = bigits_[i];
-    std::uninitialized_fill_n(bigits_.data(), exp_difference, 0);
+    memset(bigits_.data(), 0, to_unsigned(exp_difference) * sizeof(bigit));
     exp_ -= exp_difference;
   }
 
   // Divides this bignum by divisor, assigning the remainder to this and
   // returning the quotient.
-  FMT_CONSTEXPR20 int divmod_assign(const bigint& divisor) {
+  FMT_CONSTEXPR20 auto divmod_assign(const bigint& divisor) -> int {
     FMT_ASSERT(this != &divisor, "");
     if (compare(*this, divisor) < 0) return 0;
     FMT_ASSERT(divisor.bigits_[divisor.bigits_.size() - 1u] != 0, "");
@@ -3095,6 +3041,7 @@ FMT_CONSTEXPR20 inline void format_dragon(basic_fp<uint128_t> value,
   }
   int even = static_cast<int>((value.f & 1) == 0);
   if (!upper) upper = &lower;
+  bool shortest = num_digits < 0;
   if ((flags & dragon::fixup) != 0) {
     if (add_compare(numerator, *upper, denominator) + even <= 0) {
       --exp10;
@@ -3107,7 +3054,7 @@ FMT_CONSTEXPR20 inline void format_dragon(basic_fp<uint128_t> value,
     if ((flags & dragon::fixed) != 0) adjust_precision(num_digits, exp10 + 1);
   }
   // Invariant: value == (numerator / denominator) * pow(10, exp10).
-  if (num_digits < 0) {
+  if (shortest) {
     // Generate the shortest representation.
     num_digits = 0;
     char* data = buf.data();
@@ -3137,9 +3084,12 @@ FMT_CONSTEXPR20 inline void format_dragon(basic_fp<uint128_t> value,
   }
   // Generate the given number of digits.
   exp10 -= num_digits - 1;
-  if (num_digits == 0) {
-    denominator *= 10;
-    auto digit = add_compare(numerator, numerator, denominator) > 0 ? '1' : '0';
+  if (num_digits <= 0) {
+    auto digit = '0';
+    if (num_digits == 0) {
+      denominator *= 10;
+      digit = add_compare(numerator, numerator, denominator) > 0 ? '1' : '0';
+    }
     buf.push_back(digit);
     return;
   }
@@ -3162,7 +3112,10 @@ FMT_CONSTEXPR20 inline void format_dragon(basic_fp<uint128_t> value,
       }
       if (buf[0] == overflow) {
         buf[0] = '1';
-        ++exp10;
+        if ((flags & dragon::fixed) != 0)
+          buf.push_back('0');
+        else
+          ++exp10;
       }
       return;
     }
@@ -3171,6 +3124,105 @@ FMT_CONSTEXPR20 inline void format_dragon(basic_fp<uint128_t> value,
   buf[num_digits - 1] = static_cast<char>('0' + digit);
 }
 
+// Formats a floating-point number using the hexfloat format.
+template <typename Float, FMT_ENABLE_IF(!is_double_double<Float>::value)>
+FMT_CONSTEXPR20 void format_hexfloat(Float value, format_specs specs,
+                                     buffer<char>& buf) {
+  // float is passed as double to reduce the number of instantiations and to
+  // simplify implementation.
+  static_assert(!std::is_same<Float, float>::value, "");
+
+  using info = dragonbox::float_info<Float>;
+
+  // Assume Float is in the format [sign][exponent][significand].
+  using carrier_uint = typename info::carrier_uint;
+
+  constexpr auto num_float_significand_bits =
+      detail::num_significand_bits<Float>();
+
+  basic_fp<carrier_uint> f(value);
+  f.e += num_float_significand_bits;
+  if (!has_implicit_bit<Float>()) --f.e;
+
+  constexpr auto num_fraction_bits =
+      num_float_significand_bits + (has_implicit_bit<Float>() ? 1 : 0);
+  constexpr auto num_xdigits = (num_fraction_bits + 3) / 4;
+
+  constexpr auto leading_shift = ((num_xdigits - 1) * 4);
+  const auto leading_mask = carrier_uint(0xF) << leading_shift;
+  const auto leading_xdigit =
+      static_cast<uint32_t>((f.f & leading_mask) >> leading_shift);
+  if (leading_xdigit > 1) f.e -= (32 - countl_zero(leading_xdigit) - 1);
+
+  int print_xdigits = num_xdigits - 1;
+  if (specs.precision >= 0 && print_xdigits > specs.precision) {
+    const int shift = ((print_xdigits - specs.precision - 1) * 4);
+    const auto mask = carrier_uint(0xF) << shift;
+    const auto v = static_cast<uint32_t>((f.f & mask) >> shift);
+
+    if (v >= 8) {
+      const auto inc = carrier_uint(1) << (shift + 4);
+      f.f += inc;
+      f.f &= ~(inc - 1);
+    }
+
+    // Check long double overflow
+    if (!has_implicit_bit<Float>()) {
+      const auto implicit_bit = carrier_uint(1) << num_float_significand_bits;
+      if ((f.f & implicit_bit) == implicit_bit) {
+        f.f >>= 4;
+        f.e += 4;
+      }
+    }
+
+    print_xdigits = specs.precision;
+  }
+
+  char xdigits[num_bits<carrier_uint>() / 4];
+  detail::fill_n(xdigits, sizeof(xdigits), '0');
+  format_uint<4>(xdigits, f.f, num_xdigits, specs.upper);
+
+  // Remove zero tail
+  while (print_xdigits > 0 && xdigits[print_xdigits] == '0') --print_xdigits;
+
+  buf.push_back('0');
+  buf.push_back(specs.upper ? 'X' : 'x');
+  buf.push_back(xdigits[0]);
+  if (specs.alt || print_xdigits > 0 || print_xdigits < specs.precision)
+    buf.push_back('.');
+  buf.append(xdigits + 1, xdigits + 1 + print_xdigits);
+  for (; print_xdigits < specs.precision; ++print_xdigits) buf.push_back('0');
+
+  buf.push_back(specs.upper ? 'P' : 'p');
+
+  uint32_t abs_e;
+  if (f.e < 0) {
+    buf.push_back('-');
+    abs_e = static_cast<uint32_t>(-f.e);
+  } else {
+    buf.push_back('+');
+    abs_e = static_cast<uint32_t>(f.e);
+  }
+  format_decimal<char>(appender(buf), abs_e, detail::count_digits(abs_e));
+}
+
+template <typename Float, FMT_ENABLE_IF(is_double_double<Float>::value)>
+FMT_CONSTEXPR20 void format_hexfloat(Float value, format_specs specs,
+                                     buffer<char>& buf) {
+  format_hexfloat(static_cast<double>(value), specs, buf);
+}
+
+constexpr auto fractional_part_rounding_thresholds(int index) -> uint32_t {
+  // For checking rounding thresholds.
+  // The kth entry is chosen to be the smallest integer such that the
+  // upper 32-bits of 10^(k+1) times it is strictly bigger than 5 * 10^k.
+  // It is equal to ceil(2^31 + 2^32/10^(k + 1)).
+  // These are stored in a string literal because we cannot have static arrays
+  // in constexpr functions and non-static ones are poorly optimized.
+  return U"\x9999999a\x828f5c29\x80418938\x80068db9\x8000a7c6\x800010c7"
+         U"\x800001ae\x8000002b"[index];
+}
+
 template <typename Float>
 FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs,
                                   buffer<char>& buf) -> int {
@@ -3193,7 +3245,7 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs,
   int exp = 0;
   bool use_dragon = true;
   unsigned dragon_flags = 0;
-  if (!is_fast_float<Float>()) {
+  if (!is_fast_float<Float>() || is_constant_evaluated()) {
     const auto inv_log2_10 = 0.3010299956639812;  // 1 / log2(10)
     using info = dragonbox::float_info<decltype(converted_value)>;
     const auto f = basic_fp<typename info::carrier_uint>(converted_value);
@@ -3201,37 +3253,259 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs,
     //   10^(exp - 1) <= value < 10^exp or 10^exp <= value < 10^(exp + 1).
     // This is based on log10(value) == log2(value) / log2(10) and approximation
     // of log2(value) by e + num_fraction_bits idea from double-conversion.
-    exp = static_cast<int>(
-        std::ceil((f.e + count_digits<1>(f.f) - 1) * inv_log2_10 - 1e-10));
+    auto e = (f.e + count_digits<1>(f.f) - 1) * inv_log2_10 - 1e-10;
+    exp = static_cast<int>(e);
+    if (e > exp) ++exp;  // Compute ceil.
     dragon_flags = dragon::fixup;
-  } else if (!is_constant_evaluated() && precision < 0) {
+  } else if (precision < 0) {
     // Use Dragonbox for the shortest format.
     if (specs.binary32) {
       auto dec = dragonbox::to_decimal(static_cast<float>(value));
-      write<char>(buffer_appender<char>(buf), dec.significand);
+      write<char>(appender(buf), dec.significand);
       return dec.exponent;
     }
     auto dec = dragonbox::to_decimal(static_cast<double>(value));
-    write<char>(buffer_appender<char>(buf), dec.significand);
+    write<char>(appender(buf), dec.significand);
     return dec.exponent;
   } else {
-    // Use Grisu + Dragon4 for the given precision:
-    // https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf.
-    const int min_exp = -60;  // alpha in Grisu.
-    int cached_exp10 = 0;     // K in Grisu.
-    fp normalized = normalize(fp(converted_value));
-    const auto cached_pow = get_cached_power(
-        min_exp - (normalized.e + fp::num_significand_bits), cached_exp10);
-    normalized = normalized * cached_pow;
-    gen_digits_handler handler{buf.data(), 0, precision, -cached_exp10, fixed};
-    if (grisu_gen_digits(normalized, 1, exp, handler) != digits::error &&
-        !is_constant_evaluated()) {
-      exp += handler.exp10;
-      buf.try_resize(to_unsigned(handler.size));
-      use_dragon = false;
+    // Extract significand bits and exponent bits.
+    using info = dragonbox::float_info<double>;
+    auto br = bit_cast<uint64_t>(static_cast<double>(value));
+
+    const uint64_t significand_mask =
+        (static_cast<uint64_t>(1) << num_significand_bits<double>()) - 1;
+    uint64_t significand = (br & significand_mask);
+    int exponent = static_cast<int>((br & exponent_mask<double>()) >>
+                                    num_significand_bits<double>());
+
+    if (exponent != 0) {  // Check if normal.
+      exponent -= exponent_bias<double>() + num_significand_bits<double>();
+      significand |=
+          (static_cast<uint64_t>(1) << num_significand_bits<double>());
+      significand <<= 1;
     } else {
-      exp += handler.size - cached_exp10 - 1;
-      precision = handler.precision;
+      // Normalize subnormal inputs.
+      FMT_ASSERT(significand != 0, "zeros should not appear here");
+      int shift = countl_zero(significand);
+      FMT_ASSERT(shift >= num_bits<uint64_t>() - num_significand_bits<double>(),
+                 "");
+      shift -= (num_bits<uint64_t>() - num_significand_bits<double>() - 2);
+      exponent = (std::numeric_limits<double>::min_exponent -
+                  num_significand_bits<double>()) -
+                 shift;
+      significand <<= shift;
+    }
+
+    // Compute the first several nonzero decimal significand digits.
+    // We call the number we get the first segment.
+    const int k = info::kappa - dragonbox::floor_log10_pow2(exponent);
+    exp = -k;
+    const int beta = exponent + dragonbox::floor_log2_pow10(k);
+    uint64_t first_segment;
+    bool has_more_segments;
+    int digits_in_the_first_segment;
+    {
+      const auto r = dragonbox::umul192_upper128(
+          significand << beta, dragonbox::get_cached_power(k));
+      first_segment = r.high();
+      has_more_segments = r.low() != 0;
+
+      // The first segment can have 18 ~ 19 digits.
+      if (first_segment >= 1000000000000000000ULL) {
+        digits_in_the_first_segment = 19;
+      } else {
+        // When it is of 18-digits, we align it to 19-digits by adding a bogus
+        // zero at the end.
+        digits_in_the_first_segment = 18;
+        first_segment *= 10;
+      }
+    }
+
+    // Compute the actual number of decimal digits to print.
+    if (fixed) adjust_precision(precision, exp + digits_in_the_first_segment);
+
+    // Use Dragon4 only when there might be not enough digits in the first
+    // segment.
+    if (digits_in_the_first_segment > precision) {
+      use_dragon = false;
+
+      if (precision <= 0) {
+        exp += digits_in_the_first_segment;
+
+        if (precision < 0) {
+          // Nothing to do, since all we have are just leading zeros.
+          buf.try_resize(0);
+        } else {
+          // We may need to round-up.
+          buf.try_resize(1);
+          if ((first_segment | static_cast<uint64_t>(has_more_segments)) >
+              5000000000000000000ULL) {
+            buf[0] = '1';
+          } else {
+            buf[0] = '0';
+          }
+        }
+      }  // precision <= 0
+      else {
+        exp += digits_in_the_first_segment - precision;
+
+        // When precision > 0, we divide the first segment into three
+        // subsegments, each with 9, 9, and 0 ~ 1 digits so that each fits
+        // in 32-bits which usually allows faster calculation than in
+        // 64-bits. Since some compiler (e.g. MSVC) doesn't know how to optimize
+        // division-by-constant for large 64-bit divisors, we do it here
+        // manually. The magic number 7922816251426433760 below is equal to
+        // ceil(2^(64+32) / 10^10).
+        const uint32_t first_subsegment = static_cast<uint32_t>(
+            dragonbox::umul128_upper64(first_segment, 7922816251426433760ULL) >>
+            32);
+        const uint64_t second_third_subsegments =
+            first_segment - first_subsegment * 10000000000ULL;
+
+        uint64_t prod;
+        uint32_t digits;
+        bool should_round_up;
+        int number_of_digits_to_print = precision > 9 ? 9 : precision;
+
+        // Print a 9-digits subsegment, either the first or the second.
+        auto print_subsegment = [&](uint32_t subsegment, char* buffer) {
+          int number_of_digits_printed = 0;
+
+          // If we want to print an odd number of digits from the subsegment,
+          if ((number_of_digits_to_print & 1) != 0) {
+            // Convert to 64-bit fixed-point fractional form with 1-digit
+            // integer part. The magic number 720575941 is a good enough
+            // approximation of 2^(32 + 24) / 10^8; see
+            // https://jk-jeon.github.io/posts/2022/12/fixed-precision-formatting/#fixed-length-case
+            // for details.
+            prod = ((subsegment * static_cast<uint64_t>(720575941)) >> 24) + 1;
+            digits = static_cast<uint32_t>(prod >> 32);
+            *buffer = static_cast<char>('0' + digits);
+            number_of_digits_printed++;
+          }
+          // If we want to print an even number of digits from the
+          // first_subsegment,
+          else {
+            // Convert to 64-bit fixed-point fractional form with 2-digits
+            // integer part. The magic number 450359963 is a good enough
+            // approximation of 2^(32 + 20) / 10^7; see
+            // https://jk-jeon.github.io/posts/2022/12/fixed-precision-formatting/#fixed-length-case
+            // for details.
+            prod = ((subsegment * static_cast<uint64_t>(450359963)) >> 20) + 1;
+            digits = static_cast<uint32_t>(prod >> 32);
+            copy2(buffer, digits2(digits));
+            number_of_digits_printed += 2;
+          }
+
+          // Print all digit pairs.
+          while (number_of_digits_printed < number_of_digits_to_print) {
+            prod = static_cast<uint32_t>(prod) * static_cast<uint64_t>(100);
+            digits = static_cast<uint32_t>(prod >> 32);
+            copy2(buffer + number_of_digits_printed, digits2(digits));
+            number_of_digits_printed += 2;
+          }
+        };
+
+        // Print first subsegment.
+        print_subsegment(first_subsegment, buf.data());
+
+        // Perform rounding if the first subsegment is the last subsegment to
+        // print.
+        if (precision <= 9) {
+          // Rounding inside the subsegment.
+          // We round-up if:
+          //  - either the fractional part is strictly larger than 1/2, or
+          //  - the fractional part is exactly 1/2 and the last digit is odd.
+          // We rely on the following observations:
+          //  - If fractional_part >= threshold, then the fractional part is
+          //    strictly larger than 1/2.
+          //  - If the MSB of fractional_part is set, then the fractional part
+          //    must be at least 1/2.
+          //  - When the MSB of fractional_part is set, either
+          //    second_third_subsegments being nonzero or has_more_segments
+          //    being true means there are further digits not printed, so the
+          //    fractional part is strictly larger than 1/2.
+          if (precision < 9) {
+            uint32_t fractional_part = static_cast<uint32_t>(prod);
+            should_round_up =
+                fractional_part >= fractional_part_rounding_thresholds(
+                                       8 - number_of_digits_to_print) ||
+                ((fractional_part >> 31) &
+                 ((digits & 1) | (second_third_subsegments != 0) |
+                  has_more_segments)) != 0;
+          }
+          // Rounding at the subsegment boundary.
+          // In this case, the fractional part is at least 1/2 if and only if
+          // second_third_subsegments >= 5000000000ULL, and is strictly larger
+          // than 1/2 if we further have either second_third_subsegments >
+          // 5000000000ULL or has_more_segments == true.
+          else {
+            should_round_up = second_third_subsegments > 5000000000ULL ||
+                              (second_third_subsegments == 5000000000ULL &&
+                               ((digits & 1) != 0 || has_more_segments));
+          }
+        }
+        // Otherwise, print the second subsegment.
+        else {
+          // Compilers are not aware of how to leverage the maximum value of
+          // second_third_subsegments to find out a better magic number which
+          // allows us to eliminate an additional shift. 1844674407370955162 =
+          // ceil(2^64/10) < ceil(2^64*(10^9/(10^10 - 1))).
+          const uint32_t second_subsegment =
+              static_cast<uint32_t>(dragonbox::umul128_upper64(
+                  second_third_subsegments, 1844674407370955162ULL));
+          const uint32_t third_subsegment =
+              static_cast<uint32_t>(second_third_subsegments) -
+              second_subsegment * 10;
+
+          number_of_digits_to_print = precision - 9;
+          print_subsegment(second_subsegment, buf.data() + 9);
+
+          // Rounding inside the subsegment.
+          if (precision < 18) {
+            // The condition third_subsegment != 0 implies that the segment was
+            // of 19 digits, so in this case the third segment should be
+            // consisting of a genuine digit from the input.
+            uint32_t fractional_part = static_cast<uint32_t>(prod);
+            should_round_up =
+                fractional_part >= fractional_part_rounding_thresholds(
+                                       8 - number_of_digits_to_print) ||
+                ((fractional_part >> 31) &
+                 ((digits & 1) | (third_subsegment != 0) |
+                  has_more_segments)) != 0;
+          }
+          // Rounding at the subsegment boundary.
+          else {
+            // In this case, the segment must be of 19 digits, thus
+            // the third subsegment should be consisting of a genuine digit from
+            // the input.
+            should_round_up = third_subsegment > 5 ||
+                              (third_subsegment == 5 &&
+                               ((digits & 1) != 0 || has_more_segments));
+          }
+        }
+
+        // Round-up if necessary.
+        if (should_round_up) {
+          ++buf[precision - 1];
+          for (int i = precision - 1; i > 0 && buf[i] > '9'; --i) {
+            buf[i] = '0';
+            ++buf[i - 1];
+          }
+          if (buf[0] > '9') {
+            buf[0] = '1';
+            if (fixed)
+              buf[precision++] = '0';
+            else
+              ++exp;
+          }
+        }
+        buf.try_resize(to_unsigned(precision));
+      }
+    }  // if (digits_in_the_first_segment > precision)
+    else {
+      // Adjust the exponent for its use in Dragon4.
+      exp += digits_in_the_first_segment - 1;
     }
   }
   if (use_dragon) {
@@ -3258,100 +3532,102 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs,
   }
   return exp;
 }
+
 template <typename Char, typename OutputIt, typename T>
-FMT_CONSTEXPR20 auto write_float(OutputIt out, T value,
-                                 basic_format_specs<Char> specs, locale_ref loc)
-    -> OutputIt {
-  float_specs fspecs = parse_float_type_spec(specs);
-  fspecs.sign = specs.sign;
+FMT_CONSTEXPR20 auto write_float(OutputIt out, T value, format_specs specs,
+                                 locale_ref loc) -> OutputIt {
+  sign_t sign = specs.sign;
   if (detail::signbit(value)) {  // value < 0 is false for NaN so use signbit.
-    fspecs.sign = sign::minus;
+    sign = sign::minus;
     value = -value;
-  } else if (fspecs.sign == sign::minus) {
-    fspecs.sign = sign::none;
+  } else if (sign == sign::minus) {
+    sign = sign::none;
   }
 
   if (!detail::isfinite(value))
-    return write_nonfinite(out, detail::isnan(value), specs, fspecs);
+    return write_nonfinite<Char>(out, detail::isnan(value), specs, sign);
 
-  if (specs.align == align::numeric && fspecs.sign) {
+  if (specs.align == align::numeric && sign) {
     auto it = reserve(out, 1);
-    *it++ = detail::sign<Char>(fspecs.sign);
+    *it++ = detail::sign<Char>(sign);
     out = base_iterator(out, it);
-    fspecs.sign = sign::none;
+    sign = sign::none;
     if (specs.width != 0) --specs.width;
   }
 
   memory_buffer buffer;
-  if (fspecs.format == float_format::hex) {
-    if (fspecs.sign) buffer.push_back(detail::sign<char>(fspecs.sign));
-    snprintf_float(convert_float(value), specs.precision, fspecs, buffer);
-    return write_bytes<align::right>(out, {buffer.data(), buffer.size()},
-                                     specs);
+  if (specs.type == presentation_type::hexfloat) {
+    if (sign) buffer.push_back(detail::sign<char>(sign));
+    format_hexfloat(convert_float(value), specs, buffer);
+    return write_bytes<Char, align::right>(out, {buffer.data(), buffer.size()},
+                                           specs);
   }
+
   int precision = specs.precision >= 0 || specs.type == presentation_type::none
                       ? specs.precision
                       : 6;
-  if (fspecs.format == float_format::exp) {
+  if (specs.type == presentation_type::exp) {
     if (precision == max_value<int>())
-      throw_format_error("number is too big");
+      report_error("number is too big");
     else
       ++precision;
-  } else if (fspecs.format != float_format::fixed && precision == 0) {
+  } else if (specs.type != presentation_type::fixed && precision == 0) {
     precision = 1;
   }
+  float_specs fspecs = parse_float_type_spec(specs);
+  fspecs.sign = sign;
   if (const_check(std::is_same<T, float>())) fspecs.binary32 = true;
   int exp = format_float(convert_float(value), precision, fspecs, buffer);
   fspecs.precision = precision;
   auto f = big_decimal_fp{buffer.data(), static_cast<int>(buffer.size()), exp};
-  return write_float(out, f, specs, fspecs, loc);
+  return write_float<Char>(out, f, specs, fspecs, loc);
 }
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(is_floating_point<T>::value)>
-FMT_CONSTEXPR20 auto write(OutputIt out, T value,
-                           basic_format_specs<Char> specs, locale_ref loc = {})
-    -> OutputIt {
+FMT_CONSTEXPR20 auto write(OutputIt out, T value, format_specs specs,
+                           locale_ref loc = {}) -> OutputIt {
   if (const_check(!is_supported_floating_point(value))) return out;
   return specs.localized && write_loc(out, value, specs, loc)
              ? out
-             : write_float(out, value, specs, loc);
+             : write_float<Char>(out, value, specs, loc);
 }
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(is_fast_float<T>::value)>
 FMT_CONSTEXPR20 auto write(OutputIt out, T value) -> OutputIt {
-  if (is_constant_evaluated())
-    return write(out, value, basic_format_specs<Char>());
+  if (is_constant_evaluated()) return write<Char>(out, value, format_specs());
   if (const_check(!is_supported_floating_point(value))) return out;
 
-  auto fspecs = float_specs();
+  auto sign = sign_t::none;
   if (detail::signbit(value)) {
-    fspecs.sign = sign::minus;
+    sign = sign::minus;
     value = -value;
   }
 
-  constexpr auto specs = basic_format_specs<Char>();
+  constexpr auto specs = format_specs();
   using floaty = conditional_t<std::is_same<T, long double>::value, double, T>;
-  using uint = typename dragonbox::float_info<floaty>::carrier_uint;
-  uint mask = exponent_mask<floaty>();
-  if ((bit_cast<uint>(value) & mask) == mask)
-    return write_nonfinite(out, std::isnan(value), specs, fspecs);
+  using floaty_uint = typename dragonbox::float_info<floaty>::carrier_uint;
+  floaty_uint mask = exponent_mask<floaty>();
+  if ((bit_cast<floaty_uint>(value) & mask) == mask)
+    return write_nonfinite<Char>(out, std::isnan(value), specs, sign);
 
+  auto fspecs = float_specs();
+  fspecs.sign = sign;
   auto dec = dragonbox::to_decimal(static_cast<floaty>(value));
-  return write_float(out, dec, specs, fspecs, {});
+  return write_float<Char>(out, dec, specs, fspecs, {});
 }
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(is_floating_point<T>::value &&
                         !is_fast_float<T>::value)>
 inline auto write(OutputIt out, T value) -> OutputIt {
-  return write(out, value, basic_format_specs<Char>());
+  return write<Char>(out, value, format_specs());
 }
 
 template <typename Char, typename OutputIt>
-auto write(OutputIt out, monostate, basic_format_specs<Char> = {},
-           locale_ref = {}) -> OutputIt {
+auto write(OutputIt out, monostate, format_specs = {}, locale_ref = {})
+    -> OutputIt {
   FMT_ASSERT(false, "");
   return out;
 }
@@ -3359,13 +3635,11 @@ auto write(OutputIt out, monostate, basic_format_specs<Char> = {},
 template <typename Char, typename OutputIt>
 FMT_CONSTEXPR auto write(OutputIt out, basic_string_view<Char> value)
     -> OutputIt {
-  auto it = reserve(out, value.size());
-  it = copy_str_noinline<Char>(value.begin(), value.end(), it);
-  return base_iterator(out, it);
+  return copy_noinline<Char>(value.begin(), value.end(), out);
 }
 
 template <typename Char, typename OutputIt, typename T,
-          FMT_ENABLE_IF(is_string<T>::value)>
+          FMT_ENABLE_IF(has_to_string_view<T>::value)>
 constexpr auto write(OutputIt out, const T& value) -> OutputIt {
   return write<Char>(out, to_string_view(value));
 }
@@ -3384,13 +3658,12 @@ FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt {
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(std::is_same<T, bool>::value)>
-FMT_CONSTEXPR auto write(OutputIt out, T value,
-                         const basic_format_specs<Char>& specs = {},
+FMT_CONSTEXPR auto write(OutputIt out, T value, const format_specs& specs = {},
                          locale_ref = {}) -> OutputIt {
   return specs.type != presentation_type::none &&
                  specs.type != presentation_type::string
-             ? write(out, value ? 1 : 0, specs, {})
-             : write_bytes(out, value ? "true" : "false", specs);
+             ? write<Char>(out, value ? 1 : 0, specs, {})
+             : write_bytes<Char>(out, value ? "true" : "false", specs);
 }
 
 template <typename Char, typename OutputIt>
@@ -3401,22 +3674,16 @@ FMT_CONSTEXPR auto write(OutputIt out, Char value) -> OutputIt {
 }
 
 template <typename Char, typename OutputIt>
-FMT_CONSTEXPR_CHAR_TRAITS auto write(OutputIt out, const Char* value)
-    -> OutputIt {
-  if (!value) {
-    throw_format_error("string pointer is null");
-  } else {
-    out = write(out, basic_string_view<Char>(value));
-  }
+FMT_CONSTEXPR20 auto write(OutputIt out, const Char* value) -> OutputIt {
+  if (value) return write(out, basic_string_view<Char>(value));
+  report_error("string pointer is null");
   return out;
 }
 
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(std::is_same<T, void>::value)>
-auto write(OutputIt out, const T* value,
-           const basic_format_specs<Char>& specs = {}, locale_ref = {})
-    -> OutputIt {
-  check_pointer_type_spec(specs.type, error_handler());
+auto write(OutputIt out, const T* value, const format_specs& specs = {},
+           locale_ref = {}) -> OutputIt {
   return write_ptr<Char>(out, bit_cast<uintptr_t>(value), &specs);
 }
 
@@ -3424,7 +3691,7 @@ auto write(OutputIt out, const T* value,
 template <typename Char, typename OutputIt, typename T,
           typename Context = basic_format_context<OutputIt, Char>>
 FMT_CONSTEXPR auto write(OutputIt out, const T& value) -> enable_if_t<
-    std::is_class<T>::value && !is_string<T>::value &&
+    std::is_class<T>::value && !has_to_string_view<T>::value &&
         !is_floating_point<T>::value && !std::is_same<T, Char>::value &&
         !std::is_same<T, remove_cvref_t<decltype(arg_mapper<Context>().map(
                              value))>>::value,
@@ -3435,21 +3702,22 @@ FMT_CONSTEXPR auto write(OutputIt out, const T& value) -> enable_if_t<
 template <typename Char, typename OutputIt, typename T,
           typename Context = basic_format_context<OutputIt, Char>>
 FMT_CONSTEXPR auto write(OutputIt out, const T& value)
-    -> enable_if_t<mapped_type_constant<T, Context>::value == type::custom_type,
+    -> enable_if_t<mapped_type_constant<T, Context>::value ==
+                           type::custom_type &&
+                       !std::is_fundamental<T>::value,
                    OutputIt> {
-  using formatter_type =
-      conditional_t<has_formatter<T, Context>::value,
-                    typename Context::template formatter_type<T>,
-                    fallback_formatter<T, Char>>;
+  auto formatter = typename Context::template formatter_type<T>();
+  auto parse_ctx = typename Context::parse_context_type({});
+  formatter.parse(parse_ctx);
   auto ctx = Context(out, {}, {});
-  return formatter_type().format(value, ctx);
+  return formatter.format(value, ctx);
 }
 
 // An argument visitor that formats the argument and writes it via the output
 // iterator. It's a class and not a generic lambda for compatibility with C++11.
 template <typename Char> struct default_arg_formatter {
-  using iterator = buffer_appender<Char>;
-  using context = buffer_context<Char>;
+  using iterator = basic_appender<Char>;
+  using context = buffered_context<Char>;
 
   iterator out;
   basic_format_args<context> args;
@@ -3467,16 +3735,16 @@ template <typename Char> struct default_arg_formatter {
 };
 
 template <typename Char> struct arg_formatter {
-  using iterator = buffer_appender<Char>;
-  using context = buffer_context<Char>;
+  using iterator = basic_appender<Char>;
+  using context = buffered_context<Char>;
 
   iterator out;
-  const basic_format_specs<Char>& specs;
+  const format_specs& specs;
   locale_ref locale;
 
   template <typename T>
   FMT_CONSTEXPR FMT_INLINE auto operator()(T value) -> iterator {
-    return detail::write(out, value, specs, locale);
+    return detail::write<Char>(out, value, specs, locale);
   }
   auto operator()(typename basic_format_arg<context>::handle) -> iterator {
     // User-defined types are handled separately because they require access
@@ -3485,116 +3753,49 @@ template <typename Char> struct arg_formatter {
   }
 };
 
-template <typename Char> struct custom_formatter {
-  basic_format_parse_context<Char>& parse_ctx;
-  buffer_context<Char>& ctx;
-
-  void operator()(
-      typename basic_format_arg<buffer_context<Char>>::handle h) const {
-    h.format(parse_ctx, ctx);
-  }
-  template <typename T> void operator()(T) const {}
-};
-
-template <typename ErrorHandler> class width_checker {
- public:
-  explicit FMT_CONSTEXPR width_checker(ErrorHandler& eh) : handler_(eh) {}
-
+struct width_checker {
   template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
   FMT_CONSTEXPR auto operator()(T value) -> unsigned long long {
-    if (is_negative(value)) handler_.on_error("negative width");
+    if (is_negative(value)) report_error("negative width");
     return static_cast<unsigned long long>(value);
   }
 
   template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
   FMT_CONSTEXPR auto operator()(T) -> unsigned long long {
-    handler_.on_error("width is not integer");
+    report_error("width is not integer");
     return 0;
   }
-
- private:
-  ErrorHandler& handler_;
 };
 
-template <typename ErrorHandler> class precision_checker {
- public:
-  explicit FMT_CONSTEXPR precision_checker(ErrorHandler& eh) : handler_(eh) {}
-
+struct precision_checker {
   template <typename T, FMT_ENABLE_IF(is_integer<T>::value)>
   FMT_CONSTEXPR auto operator()(T value) -> unsigned long long {
-    if (is_negative(value)) handler_.on_error("negative precision");
+    if (is_negative(value)) report_error("negative precision");
     return static_cast<unsigned long long>(value);
   }
 
   template <typename T, FMT_ENABLE_IF(!is_integer<T>::value)>
   FMT_CONSTEXPR auto operator()(T) -> unsigned long long {
-    handler_.on_error("precision is not integer");
+    report_error("precision is not integer");
     return 0;
   }
-
- private:
-  ErrorHandler& handler_;
 };
 
-template <template <typename> class Handler, typename FormatArg,
-          typename ErrorHandler>
-FMT_CONSTEXPR auto get_dynamic_spec(FormatArg arg, ErrorHandler eh) -> int {
-  unsigned long long value = visit_format_arg(Handler<ErrorHandler>(eh), arg);
-  if (value > to_unsigned(max_value<int>())) eh.on_error("number is too big");
+template <typename Handler, typename FormatArg>
+FMT_CONSTEXPR auto get_dynamic_spec(FormatArg arg) -> int {
+  unsigned long long value = arg.visit(Handler());
+  if (value > to_unsigned(max_value<int>())) report_error("number is too big");
   return static_cast<int>(value);
 }
 
 template <typename Context, typename ID>
-FMT_CONSTEXPR auto get_arg(Context& ctx, ID id) ->
-    typename Context::format_arg {
+FMT_CONSTEXPR auto get_arg(Context& ctx, ID id) -> decltype(ctx.arg(id)) {
   auto arg = ctx.arg(id);
-  if (!arg) ctx.on_error("argument not found");
+  if (!arg) report_error("argument not found");
   return arg;
 }
 
-// The standard format specifier handler with checking.
-template <typename Char> class specs_handler : public specs_setter<Char> {
- private:
-  basic_format_parse_context<Char>& parse_context_;
-  buffer_context<Char>& context_;
-
-  // This is only needed for compatibility with gcc 4.4.
-  using format_arg = basic_format_arg<buffer_context<Char>>;
-
-  FMT_CONSTEXPR auto get_arg(auto_id) -> format_arg {
-    return detail::get_arg(context_, parse_context_.next_arg_id());
-  }
-
-  FMT_CONSTEXPR auto get_arg(int arg_id) -> format_arg {
-    parse_context_.check_arg_id(arg_id);
-    return detail::get_arg(context_, arg_id);
-  }
-
-  FMT_CONSTEXPR auto get_arg(basic_string_view<Char> arg_id) -> format_arg {
-    parse_context_.check_arg_id(arg_id);
-    return detail::get_arg(context_, arg_id);
-  }
-
- public:
-  FMT_CONSTEXPR specs_handler(basic_format_specs<Char>& specs,
-                              basic_format_parse_context<Char>& parse_ctx,
-                              buffer_context<Char>& ctx)
-      : specs_setter<Char>(specs), parse_context_(parse_ctx), context_(ctx) {}
-
-  template <typename Id> FMT_CONSTEXPR void on_dynamic_width(Id arg_id) {
-    this->specs_.width = get_dynamic_spec<width_checker>(
-        get_arg(arg_id), context_.error_handler());
-  }
-
-  template <typename Id> FMT_CONSTEXPR void on_dynamic_precision(Id arg_id) {
-    this->specs_.precision = get_dynamic_spec<precision_checker>(
-        get_arg(arg_id), context_.error_handler());
-  }
-
-  void on_error(const char* message) { context_.on_error(message); }
-};
-
-template <template <typename> class Handler, typename Context>
+template <typename Handler, typename Context>
 FMT_CONSTEXPR void handle_dynamic_spec(int& value,
                                        arg_ref<typename Context::char_type> ref,
                                        Context& ctx) {
@@ -3602,26 +3803,15 @@ FMT_CONSTEXPR void handle_dynamic_spec(int& value,
   case arg_id_kind::none:
     break;
   case arg_id_kind::index:
-    value = detail::get_dynamic_spec<Handler>(ctx.arg(ref.val.index),
-                                              ctx.error_handler());
+    value = detail::get_dynamic_spec<Handler>(get_arg(ctx, ref.val.index));
     break;
   case arg_id_kind::name:
-    value = detail::get_dynamic_spec<Handler>(ctx.arg(ref.val.name),
-                                              ctx.error_handler());
+    value = detail::get_dynamic_spec<Handler>(get_arg(ctx, ref.val.name));
     break;
   }
 }
 
 #if FMT_USE_USER_DEFINED_LITERALS
-template <typename Char> struct udl_formatter {
-  basic_string_view<Char> str;
-
-  template <typename... T>
-  auto operator()(T&&... args) const -> std::basic_string<Char> {
-    return vformat(str, fmt::make_format_args<buffer_context<Char>>(args...));
-  }
-};
-
 #  if FMT_USE_NONTYPE_TEMPLATE_ARGS
 template <typename T, typename Char, size_t N,
           fmt::detail_exported::fixed_string<Char, N> Str>
@@ -3660,12 +3850,12 @@ template <typename Char> struct udl_arg {
 #endif  // FMT_USE_USER_DEFINED_LITERALS
 
 template <typename Locale, typename Char>
-auto vformat(const Locale& loc, basic_string_view<Char> format_str,
-             basic_format_args<buffer_context<type_identity_t<Char>>> args)
+auto vformat(const Locale& loc, basic_string_view<Char> fmt,
+             typename detail::vformat_args<Char>::type args)
     -> std::basic_string<Char> {
-  basic_memory_buffer<Char> buffer;
-  detail::vformat_to(buffer, format_str, args, detail::locale_ref(loc));
-  return {buffer.data(), buffer.size()};
+  auto buf = basic_memory_buffer<Char>();
+  detail::vformat_to(buf, fmt, args, detail::locale_ref(loc));
+  return {buf.data(), buf.size()};
 }
 
 using format_func = void (*)(detail::buffer<char>&, int, const char*);
@@ -3673,53 +3863,48 @@ using format_func = void (*)(detail::buffer<char>&, int, const char*);
 FMT_API void format_error_code(buffer<char>& out, int error_code,
                                string_view message) noexcept;
 
+using fmt::report_error;
 FMT_API void report_error(format_func func, int error_code,
                           const char* message) noexcept;
-FMT_END_DETAIL_NAMESPACE
+}  // namespace detail
 
-#if !__NVCC__
+FMT_BEGIN_EXPORT
 FMT_API auto vsystem_error(int error_code, string_view format_str,
                            format_args args) -> std::system_error;
-#endif
+
 /**
- \rst
- Constructs :class:`std::system_error` with a message formatted with
- ``fmt::format(fmt, args...)``.
-  *error_code* is a system error code as given by ``errno``.
-
- **Example**::
-
-   // This throws std::system_error with the description
-   //   cannot open file 'madeup': No such file or directory
-   // or similar (system message may vary).
-   const char* filename = "madeup";
-   std::FILE* file = std::fopen(filename, "r");
-   if (!file)
-     throw fmt::system_error(errno, "cannot open file '{}'", filename);
- \endrst
-*/
-#if !__NVCC__
+ * Constructs `std::system_error` with a message formatted with
+ * `fmt::format(fmt, args...)`.
+ * `error_code` is a system error code as given by `errno`.
+ *
+ * **Example**:
+ *
+ *     // This throws std::system_error with the description
+ *     //   cannot open file 'madeup': No such file or directory
+ *     // or similar (system message may vary).
+ *     const char* filename = "madeup";
+ *     std::FILE* file = std::fopen(filename, "r");
+ *     if (!file)
+ *       throw fmt::system_error(errno, "cannot open file '{}'", filename);
+ */
 template <typename... T>
 auto system_error(int error_code, format_string<T...> fmt, T&&... args)
     -> std::system_error {
   return vsystem_error(error_code, fmt, fmt::make_format_args(args...));
 }
-#endif
+
 /**
-  \rst
-  Formats an error message for an error returned by an operating system or a
-  language runtime, for example a file opening error, and writes it to *out*.
-  The format is the same as the one used by ``std::system_error(ec, message)``
-  where ``ec`` is ``std::error_code(error_code, std::generic_category()})``.
-  It is implementation-defined but normally looks like:
-
-  .. parsed-literal::
-     *<message>*: *<system-message>*
-
-  where *<message>* is the passed message and *<system-message>* is the system
-  message corresponding to the error code.
-  *error_code* is a system error code as given by ``errno``.
-  \endrst
+ * Formats an error message for an error returned by an operating system or a
+ * language runtime, for example a file opening error, and writes it to `out`.
+ * The format is the same as the one used by `std::system_error(ec, message)`
+ * where `ec` is `std::error_code(error_code, std::generic_category())`.
+ * It is implementation-defined but normally looks like:
+ *
+ *     <message>: <system-message>
+ *
+ * where `<message>` is the passed message and `<system-message>` is the system
+ * message corresponding to the error code.
+ * `error_code` is a system error code as given by `errno`.
  */
 FMT_API void format_system_error(detail::buffer<char>& out, int error_code,
                                  const char* message) noexcept;
@@ -3728,7 +3913,7 @@ FMT_API void format_system_error(detail::buffer<char>& out, int error_code,
 // Can be used to report errors from destructors.
 FMT_API void report_system_error(int error_code, const char* message) noexcept;
 
-/** Fast integer formatter. */
+/// A fast integer formatter.
 class format_int {
  private:
   // Buffer should be large enough to hold all digits (digits10 + 1),
@@ -3737,12 +3922,14 @@ class format_int {
   mutable char buffer_[buffer_size];
   char* str_;
 
-  template <typename UInt> auto format_unsigned(UInt value) -> char* {
+  template <typename UInt>
+  FMT_CONSTEXPR20 auto format_unsigned(UInt value) -> char* {
     auto n = static_cast<detail::uint32_or_64_or_128_t<UInt>>(value);
     return detail::format_decimal(buffer_, n, buffer_size - 1).begin;
   }
 
-  template <typename Int> auto format_signed(Int value) -> char* {
+  template <typename Int>
+  FMT_CONSTEXPR20 auto format_signed(Int value) -> char* {
     auto abs_value = static_cast<detail::uint32_or_64_or_128_t<Int>>(value);
     bool negative = value < 0;
     if (negative) abs_value = 0 - abs_value;
@@ -3752,160 +3939,94 @@ class format_int {
   }
 
  public:
-  explicit format_int(int value) : str_(format_signed(value)) {}
-  explicit format_int(long value) : str_(format_signed(value)) {}
-  explicit format_int(long long value) : str_(format_signed(value)) {}
-  explicit format_int(unsigned value) : str_(format_unsigned(value)) {}
-  explicit format_int(unsigned long value) : str_(format_unsigned(value)) {}
-  explicit format_int(unsigned long long value)
+  explicit FMT_CONSTEXPR20 format_int(int value) : str_(format_signed(value)) {}
+  explicit FMT_CONSTEXPR20 format_int(long value)
+      : str_(format_signed(value)) {}
+  explicit FMT_CONSTEXPR20 format_int(long long value)
+      : str_(format_signed(value)) {}
+  explicit FMT_CONSTEXPR20 format_int(unsigned value)
+      : str_(format_unsigned(value)) {}
+  explicit FMT_CONSTEXPR20 format_int(unsigned long value)
+      : str_(format_unsigned(value)) {}
+  explicit FMT_CONSTEXPR20 format_int(unsigned long long value)
       : str_(format_unsigned(value)) {}
 
-  /** Returns the number of characters written to the output buffer. */
-  auto size() const -> size_t {
+  /// Returns the number of characters written to the output buffer.
+  FMT_CONSTEXPR20 auto size() const -> size_t {
     return detail::to_unsigned(buffer_ - str_ + buffer_size - 1);
   }
 
-  /**
-    Returns a pointer to the output buffer content. No terminating null
-    character is appended.
-   */
-  auto data() const -> const char* { return str_; }
+  /// Returns a pointer to the output buffer content. No terminating null
+  /// character is appended.
+  FMT_CONSTEXPR20 auto data() const -> const char* { return str_; }
 
-  /**
-    Returns a pointer to the output buffer content with terminating null
-    character appended.
-   */
-  auto c_str() const -> const char* {
+  /// Returns a pointer to the output buffer content with terminating null
+  /// character appended.
+  FMT_CONSTEXPR20 auto c_str() const -> const char* {
     buffer_[buffer_size - 1] = '\0';
     return str_;
   }
 
-  /**
-    \rst
-    Returns the content of the output buffer as an ``std::string``.
-    \endrst
-   */
+  /// Returns the content of the output buffer as an `std::string`.
   auto str() const -> std::string { return std::string(str_, size()); }
 };
 
 template <typename T, typename Char>
-template <typename FormatContext>
-FMT_CONSTEXPR FMT_INLINE auto
-formatter<T, Char,
-          enable_if_t<detail::type_constant<T, Char>::value !=
-                      detail::type::custom_type>>::format(const T& val,
-                                                          FormatContext& ctx)
-    const -> decltype(ctx.out()) {
-  if (specs_.width_ref.kind != detail::arg_id_kind::none ||
-      specs_.precision_ref.kind != detail::arg_id_kind::none) {
-    auto specs = specs_;
-    detail::handle_dynamic_spec<detail::width_checker>(specs.width,
-                                                       specs.width_ref, ctx);
-    detail::handle_dynamic_spec<detail::precision_checker>(
-        specs.precision, specs.precision_ref, ctx);
-    return detail::write<Char>(ctx.out(), val, specs, ctx.locale());
-  }
-  return detail::write<Char>(ctx.out(), val, specs_, ctx.locale());
-}
-
-template <typename Char>
-struct formatter<void*, Char> : formatter<const void*, Char> {
-  template <typename FormatContext>
-  auto format(void* val, FormatContext& ctx) const -> decltype(ctx.out()) {
-    return formatter<const void*, Char>::format(val, ctx);
-  }
-};
-
-template <typename Char, size_t N>
-struct formatter<Char[N], Char> : formatter<basic_string_view<Char>, Char> {
+struct formatter<T, Char, enable_if_t<detail::has_format_as<T>::value>>
+    : formatter<detail::format_as_t<T>, Char> {
   template <typename FormatContext>
-  FMT_CONSTEXPR auto format(const Char* val, FormatContext& ctx) const
-      -> decltype(ctx.out()) {
-    return formatter<basic_string_view<Char>, Char>::format(val, ctx);
+  auto format(const T& value, FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto&& val = format_as(value);  // Make an lvalue reference for format.
+    return formatter<detail::format_as_t<T>, Char>::format(val, ctx);
   }
 };
 
-// A formatter for types known only at run time such as variant alternatives.
-//
-// Usage:
-//   using variant = std::variant<int, std::string>;
-//   template <>
-//   struct formatter<variant>: dynamic_formatter<> {
-//     auto format(const variant& v, format_context& ctx) {
-//       return visit([&](const auto& val) {
-//           return dynamic_formatter<>::format(val, ctx);
-//       }, v);
-//     }
-//   };
-template <typename Char = char> class dynamic_formatter {
- private:
-  detail::dynamic_format_specs<Char> specs_;
-  const Char* format_str_;
-
-  struct null_handler : detail::error_handler {
-    void on_align(align_t) {}
-    void on_sign(sign_t) {}
-    void on_hash() {}
-  };
+#define FMT_FORMAT_AS(Type, Base)                                              \
+  template <typename Char>                                                     \
+  struct formatter<Type, Char> : formatter<Base, Char> {                       \
+    template <typename FormatContext>                                          \
+    auto format(Type value, FormatContext& ctx) const -> decltype(ctx.out()) { \
+      return formatter<Base, Char>::format(value, ctx);                        \
+    }                                                                          \
+  }
+
+FMT_FORMAT_AS(signed char, int);
+FMT_FORMAT_AS(unsigned char, unsigned);
+FMT_FORMAT_AS(short, int);
+FMT_FORMAT_AS(unsigned short, unsigned);
+FMT_FORMAT_AS(long, detail::long_type);
+FMT_FORMAT_AS(unsigned long, detail::ulong_type);
+FMT_FORMAT_AS(Char*, const Char*);
+FMT_FORMAT_AS(std::nullptr_t, const void*);
+FMT_FORMAT_AS(detail::std_string_view<Char>, basic_string_view<Char>);
+FMT_FORMAT_AS(void*, const void*);
+
+template <typename Char, typename Traits, typename Allocator>
+class formatter<std::basic_string<Char, Traits, Allocator>, Char>
+    : public formatter<basic_string_view<Char>, Char> {};
 
-  template <typename Context> void handle_specs(Context& ctx) {
-    detail::handle_dynamic_spec<detail::width_checker>(specs_.width,
-                                                       specs_.width_ref, ctx);
-    detail::handle_dynamic_spec<detail::precision_checker>(
-        specs_.precision, specs_.precision_ref, ctx);
-  }
-
- public:
-  template <typename ParseContext>
-  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    format_str_ = ctx.begin();
-    // Checks are deferred to formatting time when the argument type is known.
-    detail::dynamic_specs_handler<ParseContext> handler(specs_, ctx);
-    return detail::parse_format_specs(ctx.begin(), ctx.end(), handler);
-  }
-
-  template <typename T, typename FormatContext>
-  auto format(const T& val, FormatContext& ctx) -> decltype(ctx.out()) {
-    handle_specs(ctx);
-    detail::specs_checker<null_handler> checker(
-        null_handler(), detail::mapped_type_constant<T, FormatContext>::value);
-    checker.on_align(specs_.align);
-    if (specs_.sign != sign::none) checker.on_sign(specs_.sign);
-    if (specs_.alt) checker.on_hash();
-    if (specs_.precision >= 0) checker.end_precision();
-    return detail::write<Char>(ctx.out(), val, specs_, ctx.locale());
-  }
-};
+template <typename Char, size_t N>
+struct formatter<Char[N], Char> : formatter<basic_string_view<Char>, Char> {};
 
 /**
-  \rst
-  Converts ``p`` to ``const void*`` for pointer formatting.
-
-  **Example**::
-
-    auto s = fmt::format("{}", fmt::ptr(p));
-  \endrst
+ * Converts `p` to `const void*` for pointer formatting.
+ *
+ * **Example**:
+ *
+ *     auto s = fmt::format("{}", fmt::ptr(p));
  */
 template <typename T> auto ptr(T p) -> const void* {
   static_assert(std::is_pointer<T>::value, "");
   return detail::bit_cast<const void*>(p);
 }
-template <typename T> auto ptr(const std::unique_ptr<T>& p) -> const void* {
-  return p.get();
-}
-template <typename T> auto ptr(const std::shared_ptr<T>& p) -> const void* {
-  return p.get();
-}
 
 /**
-  \rst
-  Converts ``e`` to the underlying type.
-
-  **Example**::
-
-    enum class color { red, green, blue };
-    auto s = fmt::format("{}", fmt::underlying(color::red));
-  \endrst
+ * Converts `e` to the underlying type.
+ *
+ * **Example**:
+ *
+ *     enum class color { red, green, blue };
+ *     auto s = fmt::format("{}", fmt::underlying(color::red));
  */
 template <typename Enum>
 constexpr auto underlying(Enum e) noexcept -> underlying_t<Enum> {
@@ -3930,42 +4051,39 @@ class bytes {
 
 template <> struct formatter<bytes> {
  private:
-  detail::dynamic_format_specs<char> specs_;
+  detail::dynamic_format_specs<> specs_;
 
  public:
   template <typename ParseContext>
-  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    using handler_type = detail::dynamic_specs_handler<ParseContext>;
-    detail::specs_checker<handler_type> handler(handler_type(specs_, ctx),
-                                                detail::type::string_type);
-    auto it = parse_format_specs(ctx.begin(), ctx.end(), handler);
-    detail::check_string_type_spec(specs_.type, ctx.error_handler());
-    return it;
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const char* {
+    return parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx,
+                              detail::type::string_type);
   }
 
   template <typename FormatContext>
-  auto format(bytes b, FormatContext& ctx) -> decltype(ctx.out()) {
-    detail::handle_dynamic_spec<detail::width_checker>(specs_.width,
-                                                       specs_.width_ref, ctx);
+  auto format(bytes b, FormatContext& ctx) const -> decltype(ctx.out()) {
+    auto specs = specs_;
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width,
+                                                       specs.width_ref, ctx);
     detail::handle_dynamic_spec<detail::precision_checker>(
-        specs_.precision, specs_.precision_ref, ctx);
-    return detail::write_bytes(ctx.out(), b.data_, specs_);
+        specs.precision, specs.precision_ref, ctx);
+    return detail::write_bytes<char>(ctx.out(), b.data_, specs);
   }
 };
 
 // group_digits_view is not derived from view because it copies the argument.
-template <typename T> struct group_digits_view { T value; };
+template <typename T> struct group_digits_view {
+  T value;
+};
 
 /**
-  \rst
-  Returns a view that formats an integer value using ',' as a locale-independent
-  thousands separator.
-
-  **Example**::
-
-    fmt::print("{}", fmt::group_digits(12345));
-    // Output: "12,345"
-  \endrst
+ * Returns a view that formats an integer value using ',' as a
+ * locale-independent thousands separator.
+ *
+ * **Example**:
+ *
+ *     fmt::print("{}", fmt::group_digits(12345));
+ *     // Output: "12,345"
  */
 template <typename T> auto group_digits(T value) -> group_digits_view<T> {
   return {value};
@@ -3973,140 +4091,97 @@ template <typename T> auto group_digits(T value) -> group_digits_view<T> {
 
 template <typename T> struct formatter<group_digits_view<T>> : formatter<T> {
  private:
-  detail::dynamic_format_specs<char> specs_;
+  detail::dynamic_format_specs<> specs_;
 
  public:
   template <typename ParseContext>
-  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    using handler_type = detail::dynamic_specs_handler<ParseContext>;
-    detail::specs_checker<handler_type> handler(handler_type(specs_, ctx),
-                                                detail::type::int_type);
-    auto it = parse_format_specs(ctx.begin(), ctx.end(), handler);
-    detail::check_string_type_spec(specs_.type, ctx.error_handler());
-    return it;
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const char* {
+    return parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx,
+                              detail::type::int_type);
   }
 
   template <typename FormatContext>
-  auto format(group_digits_view<T> t, FormatContext& ctx)
+  auto format(group_digits_view<T> t, FormatContext& ctx) const
       -> decltype(ctx.out()) {
-    detail::handle_dynamic_spec<detail::width_checker>(specs_.width,
-                                                       specs_.width_ref, ctx);
+    auto specs = specs_;
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width,
+                                                       specs.width_ref, ctx);
     detail::handle_dynamic_spec<detail::precision_checker>(
-        specs_.precision, specs_.precision_ref, ctx);
+        specs.precision, specs.precision_ref, ctx);
+    auto arg = detail::make_write_int_arg(t.value, specs.sign);
     return detail::write_int(
-        ctx.out(), static_cast<detail::uint64_or_128_t<T>>(t.value), 0, specs_,
-        detail::digit_grouping<char>("\3", ","));
+        ctx.out(), static_cast<detail::uint64_or_128_t<T>>(arg.abs_value),
+        arg.prefix, specs, detail::digit_grouping<char>("\3", ","));
   }
 };
 
-template <typename It, typename Sentinel, typename Char = char>
-struct join_view : detail::view {
-  It begin;
-  Sentinel end;
-  basic_string_view<Char> sep;
-
-  join_view(It b, Sentinel e, basic_string_view<Char> s)
-      : begin(b), end(e), sep(s) {}
+template <typename T, typename Char> struct nested_view {
+  const formatter<T, Char>* fmt;
+  const T* value;
 };
 
-template <typename It, typename Sentinel, typename Char>
-struct formatter<join_view<It, Sentinel, Char>, Char> {
- private:
-  using value_type =
-#ifdef __cpp_lib_ranges
-      std::iter_value_t<It>;
-#else
-      typename std::iterator_traits<It>::value_type;
-#endif
-  using context = buffer_context<Char>;
-  using mapper = detail::arg_mapper<context>;
-
-  template <typename T, FMT_ENABLE_IF(has_formatter<T, context>::value)>
-  static auto map(const T& value) -> const T& {
-    return value;
-  }
-  template <typename T, FMT_ENABLE_IF(!has_formatter<T, context>::value)>
-  static auto map(const T& value) -> decltype(mapper().map(value)) {
-    return mapper().map(value);
-  }
-
-  using formatter_type =
-      conditional_t<is_formattable<value_type, Char>::value,
-                    formatter<remove_cvref_t<decltype(map(
-                                  std::declval<const value_type&>()))>,
-                              Char>,
-                    detail::fallback_formatter<value_type, Char>>;
-
-  formatter_type value_formatter_;
-
- public:
+template <typename T, typename Char>
+struct formatter<nested_view<T, Char>, Char> {
   template <typename ParseContext>
   FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    return value_formatter_.parse(ctx);
+    return ctx.begin();
   }
-
   template <typename FormatContext>
-  auto format(const join_view<It, Sentinel, Char>& value,
-              FormatContext& ctx) const -> decltype(ctx.out()) {
-    auto it = value.begin;
-    auto out = ctx.out();
-    if (it != value.end) {
-      out = value_formatter_.format(map(*it), ctx);
-      ++it;
-      while (it != value.end) {
-        out = detail::copy_str<Char>(value.sep.begin(), value.sep.end(), out);
-        ctx.advance_to(out);
-        out = value_formatter_.format(map(*it), ctx);
-        ++it;
-      }
-    }
-    return out;
+  auto format(nested_view<T, Char> view, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    return view.fmt->format(*view.value, ctx);
   }
 };
 
-/**
-  Returns a view that formats the iterator range `[begin, end)` with elements
-  separated by `sep`.
- */
-template <typename It, typename Sentinel>
-auto join(It begin, Sentinel end, string_view sep) -> join_view<It, Sentinel> {
-  return {begin, end, sep};
-}
-
-/**
-  \rst
-  Returns a view that formats `range` with elements separated by `sep`.
-
-  **Example**::
-
-    std::vector<int> v = {1, 2, 3};
-    fmt::print("{}", fmt::join(v, ", "));
-    // Output: "1, 2, 3"
-
-  ``fmt::join`` applies passed format specifiers to the range elements::
+template <typename T, typename Char = char> struct nested_formatter {
+ private:
+  int width_;
+  detail::fill_t fill_;
+  align_t align_ : 4;
+  formatter<T, Char> formatter_;
 
-    fmt::print("{:02}", fmt::join(v, ", "));
-    // Output: "01, 02, 03"
-  \endrst
- */
-template <typename Range>
-auto join(Range&& range, string_view sep)
-    -> join_view<detail::iterator_t<Range>, detail::sentinel_t<Range>> {
-  return join(std::begin(range), std::end(range), sep);
-}
+ public:
+  constexpr nested_formatter() : width_(0), align_(align_t::none) {}
+
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    auto specs = detail::dynamic_format_specs<Char>();
+    auto it = parse_format_specs(ctx.begin(), ctx.end(), specs, ctx,
+                                 detail::type::none_type);
+    width_ = specs.width;
+    fill_ = specs.fill;
+    align_ = specs.align;
+    ctx.advance_to(it);
+    return formatter_.parse(ctx);
+  }
+
+  template <typename FormatContext, typename F>
+  auto write_padded(FormatContext& ctx, F write) const -> decltype(ctx.out()) {
+    if (width_ == 0) return write(ctx.out());
+    auto buf = basic_memory_buffer<Char>();
+    write(basic_appender<Char>(buf));
+    auto specs = format_specs();
+    specs.width = width_;
+    specs.fill = fill_;
+    specs.align = align_;
+    return detail::write<Char>(
+        ctx.out(), basic_string_view<Char>(buf.data(), buf.size()), specs);
+  }
+
+  auto nested(const T& value) const -> nested_view<T, Char> {
+    return nested_view<T, Char>{&formatter_, &value};
+  }
+};
 
 /**
-  \rst
-  Converts *value* to ``std::string`` using the default format for type *T*.
-
-  **Example**::
-
-    #include <fmt/format.h>
-
-    std::string answer = fmt::to_string(42);
-  \endrst
+ * Converts `value` to `std::string` using the default format for type `T`.
+ *
+ * **Example**:
+ *
+ *     std::string answer = fmt::to_string(42);
  */
-template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value)>
+template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value &&
+                                    !detail::has_format_as<T>::value)>
 inline auto to_string(const T& value) -> std::string {
   auto buffer = memory_buffer();
   detail::write<char>(appender(buffer), value);
@@ -4131,40 +4206,33 @@ FMT_NODISCARD auto to_string(const basic_memory_buffer<Char, SIZE>& buf)
   return std::basic_string<Char>(buf.data(), size);
 }
 
-FMT_BEGIN_DETAIL_NAMESPACE
+template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value &&
+                                    detail::has_format_as<T>::value)>
+inline auto to_string(const T& value) -> std::string {
+  return to_string(format_as(value));
+}
+
+FMT_END_EXPORT
+
+namespace detail {
 
 template <typename Char>
 void vformat_to(buffer<Char>& buf, basic_string_view<Char> fmt,
-                basic_format_args<FMT_BUFFER_CONTEXT(Char)> args,
-                locale_ref loc) {
-  // workaround for msvc bug regarding name-lookup in module
-  // link names into function scope
-  using detail::arg_formatter;
-  using detail::buffer_appender;
-  using detail::custom_formatter;
-  using detail::default_arg_formatter;
-  using detail::get_arg;
-  using detail::locale_ref;
-  using detail::parse_format_specs;
-  using detail::specs_checker;
-  using detail::specs_handler;
-  using detail::to_unsigned;
-  using detail::type;
-  using detail::write;
-  auto out = buffer_appender<Char>(buf);
+                typename vformat_args<Char>::type args, locale_ref loc) {
+  auto out = basic_appender<Char>(buf);
   if (fmt.size() == 2 && equal2(fmt.data(), "{}")) {
     auto arg = args.get(0);
-    if (!arg) error_handler().on_error("argument not found");
-    visit_format_arg(default_arg_formatter<Char>{out, args, loc}, arg);
+    if (!arg) report_error("argument not found");
+    arg.visit(default_arg_formatter<Char>{out, args, loc});
     return;
   }
 
-  struct format_handler : error_handler {
+  struct format_handler {
     basic_format_parse_context<Char> parse_context;
-    buffer_context<Char> context;
+    buffered_context<Char> context;
 
-    format_handler(buffer_appender<Char> p_out, basic_string_view<Char> str,
-                   basic_format_args<buffer_context<Char>> p_args,
+    format_handler(basic_appender<Char> p_out, basic_string_view<Char> str,
+                   basic_format_args<buffered_context<Char>> p_args,
                    locale_ref p_loc)
         : parse_context(str), context(p_out, p_args, p_loc) {}
 
@@ -4177,49 +4245,52 @@ void vformat_to(buffer<Char>& buf, basic_string_view<Char> fmt,
       return parse_context.next_arg_id();
     }
     FMT_CONSTEXPR auto on_arg_id(int id) -> int {
-      return parse_context.check_arg_id(id), id;
+      parse_context.check_arg_id(id);
+      return id;
     }
     FMT_CONSTEXPR auto on_arg_id(basic_string_view<Char> id) -> int {
+      parse_context.check_arg_id(id);
       int arg_id = context.arg_id(id);
-      if (arg_id < 0) on_error("argument not found");
+      if (arg_id < 0) report_error("argument not found");
       return arg_id;
     }
 
     FMT_INLINE void on_replacement_field(int id, const Char*) {
       auto arg = get_arg(context, id);
-      context.advance_to(visit_format_arg(
-          default_arg_formatter<Char>{context.out(), context.args(),
-                                      context.locale()},
-          arg));
+      context.advance_to(arg.visit(default_arg_formatter<Char>{
+          context.out(), context.args(), context.locale()}));
     }
 
     auto on_format_specs(int id, const Char* begin, const Char* end)
         -> const Char* {
       auto arg = get_arg(context, id);
-      if (arg.type() == type::custom_type) {
-        parse_context.advance_to(parse_context.begin() +
-                                 (begin - &*parse_context.begin()));
-        visit_format_arg(custom_formatter<Char>{parse_context, context}, arg);
+      // Not using a visitor for custom types gives better codegen.
+      if (arg.format_custom(begin, parse_context, context))
         return parse_context.begin();
-      }
-      auto specs = basic_format_specs<Char>();
-      specs_checker<specs_handler<Char>> handler(
-          specs_handler<Char>(specs, parse_context, context), arg.type());
-      begin = parse_format_specs(begin, end, handler);
+      auto specs = detail::dynamic_format_specs<Char>();
+      begin = parse_format_specs(begin, end, specs, parse_context, arg.type());
+      detail::handle_dynamic_spec<detail::width_checker>(
+          specs.width, specs.width_ref, context);
+      detail::handle_dynamic_spec<detail::precision_checker>(
+          specs.precision, specs.precision_ref, context);
       if (begin == end || *begin != '}')
-        on_error("missing '}' in format string");
-      auto f = arg_formatter<Char>{context.out(), specs, context.locale()};
-      context.advance_to(visit_format_arg(f, arg));
+        report_error("missing '}' in format string");
+      context.advance_to(arg.visit(
+          arg_formatter<Char>{context.out(), specs, context.locale()}));
       return begin;
     }
+
+    FMT_NORETURN void on_error(const char* message) { report_error(message); }
   };
   detail::parse_format_string<false>(fmt, format_handler(out, fmt, args, loc));
 }
 
+FMT_BEGIN_EXPORT
+
 #ifndef FMT_HEADER_ONLY
-extern template FMT_API void vformat_to(
-    buffer<char>&, string_view, basic_format_args<FMT_BUFFER_CONTEXT(char)>,
-    locale_ref);
+extern template FMT_API void vformat_to(buffer<char>&, string_view,
+                                        typename vformat_args<>::type,
+                                        locale_ref);
 extern template FMT_API auto thousands_sep_impl<char>(locale_ref)
     -> thousands_sep_result<char>;
 extern template FMT_API auto thousands_sep_impl<wchar_t>(locale_ref)
@@ -4228,19 +4299,41 @@ extern template FMT_API auto decimal_point_impl(locale_ref) -> char;
 extern template FMT_API auto decimal_point_impl(locale_ref) -> wchar_t;
 #endif  // FMT_HEADER_ONLY
 
-FMT_END_DETAIL_NAMESPACE
+FMT_END_EXPORT
+
+template <typename T, typename Char, type TYPE>
+template <typename FormatContext>
+FMT_CONSTEXPR FMT_INLINE auto native_formatter<T, Char, TYPE>::format(
+    const T& val, FormatContext& ctx) const -> decltype(ctx.out()) {
+  if (specs_.width_ref.kind == arg_id_kind::none &&
+      specs_.precision_ref.kind == arg_id_kind::none) {
+    return write<Char>(ctx.out(), val, specs_, ctx.locale());
+  }
+  auto specs = specs_;
+  handle_dynamic_spec<width_checker>(specs.width, specs.width_ref, ctx);
+  handle_dynamic_spec<precision_checker>(specs.precision, specs.precision_ref,
+                                         ctx);
+  return write<Char>(ctx.out(), val, specs, ctx.locale());
+}
+
+}  // namespace detail
+
+FMT_BEGIN_EXPORT
+
+template <typename Char>
+struct formatter<detail::float128, Char>
+    : detail::native_formatter<detail::float128, Char,
+                               detail::type::float_type> {};
 
 #if FMT_USE_USER_DEFINED_LITERALS
 inline namespace literals {
 /**
-  \rst
-  User-defined literal equivalent of :func:`fmt::arg`.
-
-  **Example**::
-
-    using namespace fmt::literals;
-    fmt::print("Elapsed time: {s:.2f} seconds", "s"_a=1.23);
-  \endrst
+ * User-defined literal equivalent of `fmt::arg`.
+ *
+ * **Example**:
+ *
+ *     using namespace fmt::literals;
+ *     fmt::print("The answer is {answer}.", "answer"_a=42);
  */
 #  if FMT_USE_NONTYPE_TEMPLATE_ARGS
 template <detail_exported::fixed_string Str> constexpr auto operator""_a() {
@@ -4248,13 +4341,30 @@ template <detail_exported::fixed_string Str> constexpr auto operator""_a() {
   return detail::udl_arg<char_t, sizeof(Str.data) / sizeof(char_t), Str>();
 }
 #  else
-constexpr auto operator"" _a(const char* s, size_t) -> detail::udl_arg<char> {
+constexpr auto operator""_a(const char* s, size_t) -> detail::udl_arg<char> {
   return {s};
 }
 #  endif
 }  // namespace literals
 #endif  // FMT_USE_USER_DEFINED_LITERALS
 
+FMT_API auto vformat(string_view fmt, format_args args) -> std::string;
+
+/**
+ * Formats `args` according to specifications in `fmt` and returns the result
+ * as a string.
+ *
+ * **Example**:
+ *
+ *     #include <fmt/format.h>
+ *     std::string message = fmt::format("The answer is {}.", 42);
+ */
+template <typename... T>
+FMT_NODISCARD FMT_INLINE auto format(format_string<T...> fmt, T&&... args)
+    -> std::string {
+  return vformat(fmt, fmt::make_format_args(args...));
+}
+
 template <typename Locale, FMT_ENABLE_IF(detail::is_locale<Locale>::value)>
 inline auto vformat(const Locale& loc, string_view fmt, format_args args)
     -> std::string {
@@ -4265,7 +4375,7 @@ template <typename Locale, typename... T,
           FMT_ENABLE_IF(detail::is_locale<Locale>::value)>
 inline auto format(const Locale& loc, format_string<T...> fmt, T&&... args)
     -> std::string {
-  return vformat(loc, string_view(fmt), fmt::make_format_args(args...));
+  return fmt::vformat(loc, string_view(fmt), fmt::make_format_args(args...));
 }
 
 template <typename OutputIt, typename Locale,
@@ -4293,13 +4403,13 @@ FMT_NODISCARD FMT_INLINE auto formatted_size(const Locale& loc,
                                              format_string<T...> fmt,
                                              T&&... args) -> size_t {
   auto buf = detail::counting_buffer<>();
-  detail::vformat_to(buf, string_view(fmt),
-                     format_args(fmt::make_format_args(args...)),
-                     detail::locale_ref(loc));
+  detail::vformat_to<char>(buf, fmt, fmt::make_format_args(args...),
+                           detail::locale_ref(loc));
   return buf.count();
 }
 
-FMT_MODULE_EXPORT_END
+FMT_END_EXPORT
+
 FMT_END_NAMESPACE
 
 #ifdef FMT_HEADER_ONLY
@@ -4309,4 +4419,9 @@ FMT_END_NAMESPACE
 #  define FMT_FUNC
 #endif
 
+// Restore _LIBCPP_REMOVE_TRANSITIVE_INCLUDES.
+#ifdef FMT_REMOVE_TRANSITIVE_INCLUDES
+#  undef _LIBCPP_REMOVE_TRANSITIVE_INCLUDES
+#endif
+
 #endif  // FMT_FORMAT_H_
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/os.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/os.h
index 8e697ec4e6fa..5c85ea08ff4e 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/os.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/os.h
@@ -8,16 +8,18 @@
 #ifndef FMT_OS_H_
 #define FMT_OS_H_
 
-#include <cerrno>
-#include <cstddef>
-#include <cstdio>
-#include <system_error>  // std::system_error
+#include "format.h"
 
-#if defined __APPLE__ || defined(__FreeBSD__)
-#  include <xlocale.h>  // for LC_NUMERIC_MASK on OS X
-#endif
+#ifndef FMT_MODULE
+#  include <cerrno>
+#  include <cstddef>
+#  include <cstdio>
+#  include <system_error>  // std::system_error
 
-#include "format.h"
+#  if FMT_HAS_INCLUDE(<xlocale.h>)
+#    include <xlocale.h>  // LC_NUMERIC_MASK on macOS
+#  endif
+#endif  // FMT_MODULE
 
 #ifndef FMT_USE_FCNTL
 // UWP doesn't provide _pipe.
@@ -46,6 +48,7 @@
 
 // Calls to system functions are wrapped in FMT_SYSTEM for testability.
 #ifdef FMT_SYSTEM
+#  define FMT_HAS_SYSTEM
 #  define FMT_POSIX_CALL(call) FMT_SYSTEM(call)
 #else
 #  define FMT_SYSTEM(call) ::call
@@ -71,50 +74,37 @@
 #define FMT_RETRY(result, expression) FMT_RETRY_VAL(result, expression, -1)
 
 FMT_BEGIN_NAMESPACE
-FMT_MODULE_EXPORT_BEGIN
+FMT_BEGIN_EXPORT
 
 /**
-  \rst
-  A reference to a null-terminated string. It can be constructed from a C
-  string or ``std::string``.
-
-  You can use one of the following type aliases for common character types:
-
-  +---------------+-----------------------------+
-  | Type          | Definition                  |
-  +===============+=============================+
-  | cstring_view  | basic_cstring_view<char>    |
-  +---------------+-----------------------------+
-  | wcstring_view | basic_cstring_view<wchar_t> |
-  +---------------+-----------------------------+
-
-  This class is most useful as a parameter type to allow passing
-  different types of strings to a function, for example::
-
-    template <typename... Args>
-    std::string format(cstring_view format_str, const Args & ... args);
-
-    format("{}", 42);
-    format(std::string("{}"), 42);
-  \endrst
+ * A reference to a null-terminated string. It can be constructed from a C
+ * string or `std::string`.
+ *
+ * You can use one of the following type aliases for common character types:
+ *
+ * +---------------+-----------------------------+
+ * | Type          | Definition                  |
+ * +===============+=============================+
+ * | cstring_view  | basic_cstring_view<char>    |
+ * +---------------+-----------------------------+
+ * | wcstring_view | basic_cstring_view<wchar_t> |
+ * +---------------+-----------------------------+
+ *
+ * This class is most useful as a parameter type for functions that wrap C APIs.
  */
 template <typename Char> class basic_cstring_view {
  private:
   const Char* data_;
 
  public:
-  /** Constructs a string reference object from a C string. */
+  /// Constructs a string reference object from a C string.
   basic_cstring_view(const Char* s) : data_(s) {}
 
-  /**
-    \rst
-    Constructs a string reference from an ``std::string`` object.
-    \endrst
-   */
+  /// Constructs a string reference from an `std::string` object.
   basic_cstring_view(const std::basic_string<Char>& s) : data_(s.c_str()) {}
 
-  /** Returns the pointer to a C string. */
-  const Char* c_str() const { return data_; }
+  /// Returns the pointer to a C string.
+  auto c_str() const -> const Char* { return data_; }
 };
 
 using cstring_view = basic_cstring_view<char>;
@@ -123,62 +113,39 @@ using wcstring_view = basic_cstring_view<wchar_t>;
 #ifdef _WIN32
 FMT_API const std::error_category& system_category() noexcept;
 
-FMT_BEGIN_DETAIL_NAMESPACE
-// A converter from UTF-16 to UTF-8.
-// It is only provided for Windows since other systems support UTF-8 natively.
-class utf16_to_utf8 {
- private:
-  memory_buffer buffer_;
-
- public:
-  utf16_to_utf8() {}
-  FMT_API explicit utf16_to_utf8(basic_string_view<wchar_t> s);
-  operator string_view() const { return string_view(&buffer_[0], size()); }
-  size_t size() const { return buffer_.size() - 1; }
-  const char* c_str() const { return &buffer_[0]; }
-  std::string str() const { return std::string(&buffer_[0], size()); }
-
-  // Performs conversion returning a system error code instead of
-  // throwing exception on conversion error. This method may still throw
-  // in case of memory allocation error.
-  FMT_API int convert(basic_string_view<wchar_t> s);
-};
-
+namespace detail {
 FMT_API void format_windows_error(buffer<char>& out, int error_code,
                                   const char* message) noexcept;
-FMT_END_DETAIL_NAMESPACE
+}
 
 FMT_API std::system_error vwindows_error(int error_code, string_view format_str,
                                          format_args args);
 
 /**
- \rst
- Constructs a :class:`std::system_error` object with the description
- of the form
-
- .. parsed-literal::
-   *<message>*: *<system-message>*
-
- where *<message>* is the formatted message and *<system-message>* is the
- system message corresponding to the error code.
- *error_code* is a Windows error code as given by ``GetLastError``.
- If *error_code* is not a valid error code such as -1, the system message
- will look like "error -1".
-
- **Example**::
-
-   // This throws a system_error with the description
-   //   cannot open file 'madeup': The system cannot find the file specified.
-   // or similar (system message may vary).
-   const char *filename = "madeup";
-   LPOFSTRUCT of = LPOFSTRUCT();
-   HFILE file = OpenFile(filename, &of, OF_READ);
-   if (file == HFILE_ERROR) {
-     throw fmt::windows_error(GetLastError(),
-                              "cannot open file '{}'", filename);
-   }
- \endrst
-*/
+ * Constructs a `std::system_error` object with the description of the form
+ *
+ *     <message>: <system-message>
+ *
+ * where `<message>` is the formatted message and `<system-message>` is the
+ * system message corresponding to the error code.
+ * `error_code` is a Windows error code as given by `GetLastError`.
+ * If `error_code` is not a valid error code such as -1, the system message
+ * will look like "error -1".
+ *
+ * **Example**:
+ *
+ *     // This throws a system_error with the description
+ *     //   cannot open file 'madeup': The system cannot find the
+ *     //   file specified.
+ *     // or similar (system message may vary).
+ *     const char *filename = "madeup";
+ *     LPOFSTRUCT of = LPOFSTRUCT();
+ *     HFILE file = OpenFile(filename, &of, OF_READ);
+ *     if (file == HFILE_ERROR) {
+ *       throw fmt::windows_error(GetLastError(),
+ *                                "cannot open file '{}'", filename);
+ *     }
+ */
 template <typename... Args>
 std::system_error windows_error(int error_code, string_view message,
                                 const Args&... args) {
@@ -189,7 +156,7 @@ std::system_error windows_error(int error_code, string_view message,
 // Can be used to report errors from destructors.
 FMT_API void report_windows_error(int error_code, const char* message) noexcept;
 #else
-inline const std::error_category& system_category() noexcept {
+inline auto system_category() noexcept -> const std::error_category& {
   return std::system_category();
 }
 #endif  // _WIN32
@@ -226,7 +193,7 @@ class buffered_file {
     other.file_ = nullptr;
   }
 
-  buffered_file& operator=(buffered_file&& other) {
+  auto operator=(buffered_file&& other) -> buffered_file& {
     close();
     file_ = other.file_;
     other.file_ = nullptr;
@@ -240,21 +207,20 @@ class buffered_file {
   FMT_API void close();
 
   // Returns the pointer to a FILE object representing this file.
-  FILE* get() const noexcept { return file_; }
-
-  FMT_API int descriptor() const;
+  auto get() const noexcept -> FILE* { return file_; }
 
-  void vprint(string_view format_str, format_args args) {
-    fmt::vprint(file_, format_str, args);
-  }
+  FMT_API auto descriptor() const -> int;
 
-  template <typename... Args>
-  inline void print(string_view format_str, const Args&... args) {
-    vprint(format_str, fmt::make_format_args(args...));
+  template <typename... T>
+  inline void print(string_view fmt, const T&... args) {
+    const auto& vargs = fmt::make_format_args(args...);
+    detail::is_locking<T...>() ? fmt::vprint_buffered(file_, fmt, vargs)
+                               : fmt::vprint(file_, fmt, vargs);
   }
 };
 
 #if FMT_USE_FCNTL
+
 // A file. Closed file is represented by a file object with descriptor -1.
 // Methods that are not declared with noexcept may throw
 // fmt::system_error in case of failure. Note that some errors such as
@@ -268,6 +234,8 @@ class FMT_API file {
   // Constructs a file object with a given descriptor.
   explicit file(int fd) : fd_(fd) {}
 
+  friend struct pipe;
+
  public:
   // Possible values for the oflag argument to the constructor.
   enum {
@@ -292,7 +260,7 @@ class FMT_API file {
   file(file&& other) noexcept : fd_(other.fd_) { other.fd_ = -1; }
 
   // Move assignment is not noexcept because close may throw.
-  file& operator=(file&& other) {
+  auto operator=(file&& other) -> file& {
     close();
     fd_ = other.fd_;
     other.fd_ = -1;
@@ -303,24 +271,24 @@ class FMT_API file {
   ~file() noexcept;
 
   // Returns the file descriptor.
-  int descriptor() const noexcept { return fd_; }
+  auto descriptor() const noexcept -> int { return fd_; }
 
   // Closes the file.
   void close();
 
   // Returns the file size. The size has signed type for consistency with
   // stat::st_size.
-  long long size() const;
+  auto size() const -> long long;
 
   // Attempts to read count bytes from the file into the specified buffer.
-  size_t read(void* buffer, size_t count);
+  auto read(void* buffer, size_t count) -> size_t;
 
   // Attempts to write count bytes from the specified buffer to the file.
-  size_t write(const void* buffer, size_t count);
+  auto write(const void* buffer, size_t count) -> size_t;
 
   // Duplicates a file descriptor with the dup function and returns
   // the duplicate as a file object.
-  static file dup(int fd);
+  static auto dup(int fd) -> file;
 
   // Makes fd be the copy of this file descriptor, closing fd first if
   // necessary.
@@ -330,24 +298,35 @@ class FMT_API file {
   // necessary.
   void dup2(int fd, std::error_code& ec) noexcept;
 
-  // Creates a pipe setting up read_end and write_end file objects for reading
-  // and writing respectively.
-  static void pipe(file& read_end, file& write_end);
-
   // Creates a buffered_file object associated with this file and detaches
   // this file object from the file.
-  buffered_file fdopen(const char* mode);
+  auto fdopen(const char* mode) -> buffered_file;
+
+#  if defined(_WIN32) && !defined(__MINGW32__)
+  // Opens the file named by the wide-character string `path` and constructs
+  // a file object representing it. Windows only.
+  static file open_windows_file(wcstring_view path, int oflag);
+#  endif
+};
+
+struct FMT_API pipe {
+  file read_end;
+  file write_end;
+
+  // Creates a pipe setting up read_end and write_end file objects for reading
+  // and writing respectively.
+  pipe();
 };
 
 // Returns the memory page size.
-long getpagesize();
+auto getpagesize() -> long;
 
-FMT_BEGIN_DETAIL_NAMESPACE
+namespace detail {
 
 struct buffer_size {
   buffer_size() = default;
   size_t value = 0;
-  buffer_size operator=(size_t val) const {
+  auto operator=(size_t val) const -> buffer_size {
     auto bs = buffer_size();
     bs.value = val;
     return bs;
@@ -379,82 +358,82 @@ struct ostream_params {
 #  endif
 };
 
-FMT_END_DETAIL_NAMESPACE
-
-// Added {} below to work around default constructor error known to
-// occur in Xcode versions 7.2.1 and 8.2.1.
-constexpr detail::buffer_size buffer_size{};
-
-/** A fast output stream which is not thread-safe. */
-class FMT_API ostream final : private detail::buffer<char> {
+class file_buffer final : public buffer<char> {
  private:
   file file_;
 
-  void grow(size_t) override;
-
-  ostream(cstring_view path, const detail::ostream_params& params)
-      : file_(path, params.oflag) {
-    set(new char[params.buffer_size], params.buffer_size);
-  }
+  FMT_API static void grow(buffer<char>& buf, size_t);
 
  public:
-  ostream(ostream&& other)
-      : detail::buffer<char>(other.data(), other.size(), other.capacity()),
-        file_(std::move(other.file_)) {
-    other.clear();
-    other.set(nullptr, 0);
-  }
-  ~ostream() {
-    flush();
-    delete[] data();
-  }
+  FMT_API file_buffer(cstring_view path, const ostream_params& params);
+  FMT_API file_buffer(file_buffer&& other) noexcept;
+  FMT_API ~file_buffer();
 
   void flush() {
     if (size() == 0) return;
-    file_.write(data(), size());
+    file_.write(data(), size() * sizeof(data()[0]));
     clear();
   }
 
-  template <typename... T>
-  friend ostream output_file(cstring_view path, T... params);
-
   void close() {
     flush();
     file_.close();
   }
+};
+
+}  // namespace detail
+
+constexpr auto buffer_size = detail::buffer_size();
 
-  /**
-    Formats ``args`` according to specifications in ``fmt`` and writes the
-    output to the file.
-   */
+/// A fast output stream for writing from a single thread. Writing from
+/// multiple threads without external synchronization may result in a data race.
+class FMT_API ostream {
+ private:
+  FMT_MSC_WARNING(suppress : 4251)
+  detail::file_buffer buffer_;
+
+  ostream(cstring_view path, const detail::ostream_params& params)
+      : buffer_(path, params) {}
+
+ public:
+  ostream(ostream&& other) : buffer_(std::move(other.buffer_)) {}
+
+  ~ostream();
+
+  void flush() { buffer_.flush(); }
+
+  template <typename... T>
+  friend auto output_file(cstring_view path, T... params) -> ostream;
+
+  void close() { buffer_.close(); }
+
+  /// Formats `args` according to specifications in `fmt` and writes the
+  /// output to the file.
   template <typename... T> void print(format_string<T...> fmt, T&&... args) {
-    vformat_to(detail::buffer_appender<char>(*this), fmt,
-               fmt::make_format_args(args...));
+    vformat_to(appender(buffer_), fmt, fmt::make_format_args(args...));
   }
 };
 
 /**
-  \rst
-  Opens a file for writing. Supported parameters passed in *params*:
-
-  * ``<integer>``: Flags passed to `open
-    <https://pubs.opengroup.org/onlinepubs/007904875/functions/open.html>`_
-    (``file::WRONLY | file::CREATE | file::TRUNC`` by default)
-  * ``buffer_size=<integer>``: Output buffer size
-
-  **Example**::
-
-    auto out = fmt::output_file("guide.txt");
-    out.print("Don't {}", "Panic");
-  \endrst
+ * Opens a file for writing. Supported parameters passed in `params`:
+ *
+ * - `<integer>`: Flags passed to [open](
+ *   https://pubs.opengroup.org/onlinepubs/007904875/functions/open.html)
+ *   (`file::WRONLY | file::CREATE | file::TRUNC` by default)
+ * - `buffer_size=<integer>`: Output buffer size
+ *
+ * **Example**:
+ *
+ *     auto out = fmt::output_file("guide.txt");
+ *     out.print("Don't {}", "Panic");
  */
 template <typename... T>
-inline ostream output_file(cstring_view path, T... params) {
+inline auto output_file(cstring_view path, T... params) -> ostream {
   return {path, detail::ostream_params(params...)};
 }
 #endif  // FMT_USE_FCNTL
 
-FMT_MODULE_EXPORT_END
+FMT_END_EXPORT
 FMT_END_NAMESPACE
 
 #endif  // FMT_OS_H_
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/ostream.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/ostream.h
index 86ec47e13ad7..98faef659f52 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/ostream.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/ostream.h
@@ -8,61 +8,29 @@
 #ifndef FMT_OSTREAM_H_
 #define FMT_OSTREAM_H_
 
-#include <fstream>
-#include <ostream>
-#if defined(_WIN32) && defined(__GLIBCXX__)
-#  include <ext/stdio_filebuf.h>
-#  include <ext/stdio_sync_filebuf.h>
-#elif defined(_WIN32) && defined(_LIBCPP_VERSION)
-#  include <__std_stream>
+#ifndef FMT_MODULE
+#  include <fstream>  // std::filebuf
 #endif
 
-#include "format.h"
-
-FMT_BEGIN_NAMESPACE
+#ifdef _WIN32
+#  ifdef __GLIBCXX__
+#    include <ext/stdio_filebuf.h>
+#    include <ext/stdio_sync_filebuf.h>
+#  endif
+#  include <io.h>
+#endif
 
-template <typename OutputIt, typename Char> class basic_printf_context;
+#include "chrono.h"  // formatbuf
 
+FMT_BEGIN_NAMESPACE
 namespace detail {
 
-// Checks if T has a user-defined operator<<.
-template <typename T, typename Char, typename Enable = void>
-class is_streamable {
- private:
-  template <typename U>
-  static auto test(int)
-      -> bool_constant<sizeof(std::declval<std::basic_ostream<Char>&>()
-                              << std::declval<U>()) != 0>;
-
-  template <typename> static auto test(...) -> std::false_type;
-
-  using result = decltype(test<T>(0));
-
- public:
-  is_streamable() = default;
-
-  static const bool value = result::value;
-};
-
-// Formatting of built-in types and arrays is intentionally disabled because
-// it's handled by standard (non-ostream) formatters.
-template <typename T, typename Char>
-struct is_streamable<
-    T, Char,
-    enable_if_t<
-        std::is_arithmetic<T>::value || std::is_array<T>::value ||
-        std::is_pointer<T>::value || std::is_same<T, char8_type>::value ||
-        std::is_convertible<T, fmt::basic_string_view<Char>>::value ||
-        std::is_same<T, std_string_view<Char>>::value ||
-        (std::is_convertible<T, int>::value && !std::is_enum<T>::value)>>
-    : std::false_type {};
-
 // Generate a unique explicit instantion in every translation unit using a tag
 // type in an anonymous namespace.
 namespace {
 struct file_access_tag {};
 }  // namespace
-template <class Tag, class BufType, FILE* BufType::*FileMemberPtr>
+template <typename Tag, typename BufType, FILE* BufType::*FileMemberPtr>
 class file_access {
   friend auto get_file(BufType& obj) -> FILE* { return obj.*FileMemberPtr; }
 };
@@ -71,36 +39,40 @@ class file_access {
 template class file_access<file_access_tag, std::filebuf,
                            &std::filebuf::_Myfile>;
 auto get_file(std::filebuf&) -> FILE*;
-#elif defined(_WIN32) && defined(_LIBCPP_VERSION)
-template class file_access<file_access_tag, std::__stdoutbuf<char>,
-                           &std::__stdoutbuf<char>::__file_>;
-auto get_file(std::__stdoutbuf<char>&) -> FILE*;
 #endif
 
-inline bool write_ostream_unicode(std::ostream& os, fmt::string_view data) {
-#if FMT_MSC_VERSION
+inline auto write_ostream_unicode(std::ostream& os, fmt::string_view data)
+    -> bool {
+  FILE* f = nullptr;
+#if FMT_MSC_VERSION && FMT_USE_RTTI
   if (auto* buf = dynamic_cast<std::filebuf*>(os.rdbuf()))
-    if (FILE* f = get_file(*buf)) return write_console(f, data);
-#elif defined(_WIN32) && defined(__GLIBCXX__)
+    f = get_file(*buf);
+  else
+    return false;
+#elif defined(_WIN32) && defined(__GLIBCXX__) && FMT_USE_RTTI
   auto* rdbuf = os.rdbuf();
-  FILE* c_file;
   if (auto* sfbuf = dynamic_cast<__gnu_cxx::stdio_sync_filebuf<char>*>(rdbuf))
-    c_file = sfbuf->file();
+    f = sfbuf->file();
   else if (auto* fbuf = dynamic_cast<__gnu_cxx::stdio_filebuf<char>*>(rdbuf))
-    c_file = fbuf->file();
+    f = fbuf->file();
   else
     return false;
-  if (c_file) return write_console(c_file, data);
-#elif defined(_WIN32) && defined(_LIBCPP_VERSION)
-  if (auto* buf = dynamic_cast<std::__stdoutbuf<char>*>(os.rdbuf()))
-    if (FILE* f = get_file(*buf)) return write_console(f, data);
 #else
-  ignore_unused(os, data);
+  ignore_unused(os, data, f);
+#endif
+#ifdef _WIN32
+  if (f) {
+    int fd = _fileno(f);
+    if (_isatty(fd)) {
+      os.flush();
+      return write_console(fd, data);
+    }
+  }
 #endif
   return false;
 }
-inline bool write_ostream_unicode(std::wostream&,
-                                  fmt::basic_string_view<wchar_t>) {
+inline auto write_ostream_unicode(std::wostream&,
+                                  fmt::basic_string_view<wchar_t>) -> bool {
   return false;
 }
 
@@ -121,18 +93,19 @@ void write_buffer(std::basic_ostream<Char>& os, buffer<Char>& buf) {
 }
 
 template <typename Char, typename T>
-void format_value(buffer<Char>& buf, const T& value,
-                  locale_ref loc = locale_ref()) {
+void format_value(buffer<Char>& buf, const T& value) {
   auto&& format_buf = formatbuf<std::basic_streambuf<Char>>(buf);
   auto&& output = std::basic_ostream<Char>(&format_buf);
 #if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
-  if (loc) output.imbue(loc.get<std::locale>());
+  output.imbue(std::locale::classic());  // The default is always unlocalized.
 #endif
   output << value;
   output.exceptions(std::ios_base::failbit | std::ios_base::badbit);
 }
 
-template <typename T> struct streamed_view { const T& value; };
+template <typename T> struct streamed_view {
+  const T& value;
+};
 
 }  // namespace detail
 
@@ -141,11 +114,10 @@ template <typename Char>
 struct basic_ostream_formatter : formatter<basic_string_view<Char>, Char> {
   void set_debug_format() = delete;
 
-  template <typename T, typename OutputIt>
-  auto format(const T& value, basic_format_context<OutputIt, Char>& ctx) const
-      -> OutputIt {
+  template <typename T, typename Context>
+  auto format(const T& value, Context& ctx) const -> decltype(ctx.out()) {
     auto buffer = basic_memory_buffer<Char>();
-    format_value(buffer, value, ctx.locale());
+    detail::format_value(buffer, value);
     return formatter<basic_string_view<Char>, Char>::format(
         {buffer.data(), buffer.size()}, ctx);
   }
@@ -156,37 +128,28 @@ using ostream_formatter = basic_ostream_formatter<char>;
 template <typename T, typename Char>
 struct formatter<detail::streamed_view<T>, Char>
     : basic_ostream_formatter<Char> {
-  template <typename OutputIt>
-  auto format(detail::streamed_view<T> view,
-              basic_format_context<OutputIt, Char>& ctx) const -> OutputIt {
+  template <typename Context>
+  auto format(detail::streamed_view<T> view, Context& ctx) const
+      -> decltype(ctx.out()) {
     return basic_ostream_formatter<Char>::format(view.value, ctx);
   }
 };
 
 /**
-  \rst
-  Returns a view that formats `value` via an ostream ``operator<<``.
-
-  **Example**::
-
-    fmt::print("Current thread id: {}\n",
-               fmt::streamed(std::this_thread::get_id()));
-  \endrst
+ * Returns a view that formats `value` via an ostream `operator<<`.
+ *
+ * **Example**:
+ *
+ *     fmt::print("Current thread id: {}\n",
+ *                fmt::streamed(std::this_thread::get_id()));
  */
 template <typename T>
-auto streamed(const T& value) -> detail::streamed_view<T> {
+constexpr auto streamed(const T& value) -> detail::streamed_view<T> {
   return {value};
 }
 
 namespace detail {
 
-// Formats an object of type T that has an overloaded ostream operator<<.
-template <typename T, typename Char>
-struct fallback_formatter<T, Char, enable_if_t<is_streamable<T, Char>::value>>
-    : basic_ostream_formatter<Char> {
-  using basic_ostream_formatter<Char>::format;
-};
-
 inline void vprint_directly(std::ostream& os, string_view format_str,
                             format_args args) {
   auto buffer = memory_buffer();
@@ -196,10 +159,10 @@ inline void vprint_directly(std::ostream& os, string_view format_str,
 
 }  // namespace detail
 
-FMT_MODULE_EXPORT template <typename Char>
+FMT_EXPORT template <typename Char>
 void vprint(std::basic_ostream<Char>& os,
             basic_string_view<type_identity_t<Char>> format_str,
-            basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+            typename detail::vformat_args<Char>::type args) {
   auto buffer = basic_memory_buffer<Char>();
   detail::vformat_to(buffer, format_str, args);
   if (detail::write_ostream_unicode(os, {buffer.data(), buffer.size()})) return;
@@ -207,29 +170,40 @@ void vprint(std::basic_ostream<Char>& os,
 }
 
 /**
-  \rst
-  Prints formatted data to the stream *os*.
-
-  **Example**::
-
-    fmt::print(cerr, "Don't {}!", "panic");
-  \endrst
+ * Prints formatted data to the stream `os`.
+ *
+ * **Example**:
+ *
+ *     fmt::print(cerr, "Don't {}!", "panic");
  */
-FMT_MODULE_EXPORT template <typename... T>
+FMT_EXPORT template <typename... T>
 void print(std::ostream& os, format_string<T...> fmt, T&&... args) {
   const auto& vargs = fmt::make_format_args(args...);
-  if (detail::is_utf8())
+  if (detail::use_utf8())
     vprint(os, fmt, vargs);
   else
     detail::vprint_directly(os, fmt, vargs);
 }
 
-FMT_MODULE_EXPORT
+FMT_EXPORT
 template <typename... Args>
 void print(std::wostream& os,
            basic_format_string<wchar_t, type_identity_t<Args>...> fmt,
            Args&&... args) {
-  vprint(os, fmt, fmt::make_format_args<buffer_context<wchar_t>>(args...));
+  vprint(os, fmt, fmt::make_format_args<buffered_context<wchar_t>>(args...));
+}
+
+FMT_EXPORT template <typename... T>
+void println(std::ostream& os, format_string<T...> fmt, T&&... args) {
+  fmt::print(os, "{}\n", fmt::format(fmt, std::forward<T>(args)...));
+}
+
+FMT_EXPORT
+template <typename... Args>
+void println(std::wostream& os,
+             basic_format_string<wchar_t, type_identity_t<Args>...> fmt,
+             Args&&... args) {
+  print(os, L"{}\n", fmt::format(fmt, std::forward<Args>(args)...));
 }
 
 FMT_END_NAMESPACE
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/printf.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/printf.h
index 70a592dc26dd..072cc6b309d4 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/printf.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/printf.h
@@ -8,107 +8,103 @@
 #ifndef FMT_PRINTF_H_
 #define FMT_PRINTF_H_
 
-#include <algorithm>  // std::max
-#include <limits>     // std::numeric_limits
+#ifndef FMT_MODULE
+#  include <algorithm>  // std::max
+#  include <limits>     // std::numeric_limits
+#endif
 
 #include "format.h"
 
 FMT_BEGIN_NAMESPACE
-FMT_MODULE_EXPORT_BEGIN
+FMT_BEGIN_EXPORT
 
-template <typename T> struct printf_formatter { printf_formatter() = delete; };
-
-template <typename Char>
-class basic_printf_parse_context : public basic_format_parse_context<Char> {
-  using basic_format_parse_context<Char>::basic_format_parse_context;
+template <typename T> struct printf_formatter {
+  printf_formatter() = delete;
 };
 
-template <typename OutputIt, typename Char> class basic_printf_context {
+template <typename Char> class basic_printf_context {
  private:
-  OutputIt out_;
+  basic_appender<Char> out_;
   basic_format_args<basic_printf_context> args_;
 
+  static_assert(std::is_same<Char, char>::value ||
+                    std::is_same<Char, wchar_t>::value,
+                "Unsupported code unit type.");
+
  public:
   using char_type = Char;
-  using format_arg = basic_format_arg<basic_printf_context>;
-  using parse_context_type = basic_printf_parse_context<Char>;
+  using parse_context_type = basic_format_parse_context<Char>;
   template <typename T> using formatter_type = printf_formatter<T>;
 
-  /**
-    \rst
-    Constructs a ``printf_context`` object. References to the arguments are
-    stored in the context object so make sure they have appropriate lifetimes.
-    \endrst
-   */
-  basic_printf_context(OutputIt out,
+  /// Constructs a `printf_context` object. References to the arguments are
+  /// stored in the context object so make sure they have appropriate lifetimes.
+  basic_printf_context(basic_appender<Char> out,
                        basic_format_args<basic_printf_context> args)
       : out_(out), args_(args) {}
 
-  OutputIt out() { return out_; }
-  void advance_to(OutputIt it) { out_ = it; }
-
-  detail::locale_ref locale() { return {}; }
+  auto out() -> basic_appender<Char> { return out_; }
+  void advance_to(basic_appender<Char>) {}
 
-  format_arg arg(int id) const { return args_.get(id); }
+  auto locale() -> detail::locale_ref { return {}; }
 
-  FMT_CONSTEXPR void on_error(const char* message) {
-    detail::error_handler().on_error(message);
+  auto arg(int id) const -> basic_format_arg<basic_printf_context> {
+    return args_.get(id);
   }
 };
 
-FMT_BEGIN_DETAIL_NAMESPACE
+namespace detail {
 
 // Checks if a value fits in int - used to avoid warnings about comparing
 // signed and unsigned integers.
 template <bool IsSigned> struct int_checker {
-  template <typename T> static bool fits_in_int(T value) {
-    unsigned max = max_value<int>();
+  template <typename T> static auto fits_in_int(T value) -> bool {
+    unsigned max = to_unsigned(max_value<int>());
     return value <= max;
   }
-  static bool fits_in_int(bool) { return true; }
+  static auto fits_in_int(bool) -> bool { return true; }
 };
 
 template <> struct int_checker<true> {
-  template <typename T> static bool fits_in_int(T value) {
+  template <typename T> static auto fits_in_int(T value) -> bool {
     return value >= (std::numeric_limits<int>::min)() &&
            value <= max_value<int>();
   }
-  static bool fits_in_int(int) { return true; }
+  static auto fits_in_int(int) -> bool { return true; }
 };
 
-class printf_precision_handler {
- public:
+struct printf_precision_handler {
   template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
-  int operator()(T value) {
+  auto operator()(T value) -> int {
     if (!int_checker<std::numeric_limits<T>::is_signed>::fits_in_int(value))
-      FMT_THROW(format_error("number is too big"));
+      report_error("number is too big");
     return (std::max)(static_cast<int>(value), 0);
   }
 
   template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value)>
-  int operator()(T) {
-    FMT_THROW(format_error("precision is not integer"));
+  auto operator()(T) -> int {
+    report_error("precision is not integer");
     return 0;
   }
 };
 
 // An argument visitor that returns true iff arg is a zero integer.
-class is_zero_int {
- public:
+struct is_zero_int {
   template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
-  bool operator()(T value) {
+  auto operator()(T value) -> bool {
     return value == 0;
   }
 
   template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value)>
-  bool operator()(T) {
+  auto operator()(T) -> bool {
     return false;
   }
 };
 
 template <typename T> struct make_unsigned_or_bool : std::make_unsigned<T> {};
 
-template <> struct make_unsigned_or_bool<bool> { using type = bool; };
+template <> struct make_unsigned_or_bool<bool> {
+  using type = bool;
+};
 
 template <typename T, typename Context> class arg_converter {
  private:
@@ -132,22 +128,23 @@ template <typename T, typename Context> class arg_converter {
     if (const_check(sizeof(target_type) <= sizeof(int))) {
       // Extra casts are used to silence warnings.
       if (is_signed) {
-        arg_ = detail::make_arg<Context>(
-            static_cast<int>(static_cast<target_type>(value)));
+        auto n = static_cast<int>(static_cast<target_type>(value));
+        arg_ = detail::make_arg<Context>(n);
       } else {
         using unsigned_type = typename make_unsigned_or_bool<target_type>::type;
-        arg_ = detail::make_arg<Context>(
-            static_cast<unsigned>(static_cast<unsigned_type>(value)));
+        auto n = static_cast<unsigned>(static_cast<unsigned_type>(value));
+        arg_ = detail::make_arg<Context>(n);
       }
     } else {
       if (is_signed) {
         // glibc's printf doesn't sign extend arguments of smaller types:
         //   std::printf("%lld", -42);  // prints "4294967254"
         // but we don't have to do the same because it's a UB.
-        arg_ = detail::make_arg<Context>(static_cast<long long>(value));
+        auto n = static_cast<long long>(value);
+        arg_ = detail::make_arg<Context>(n);
       } else {
-        arg_ = detail::make_arg<Context>(
-            static_cast<typename make_unsigned_or_bool<U>::type>(value));
+        auto n = static_cast<typename make_unsigned_or_bool<U>::type>(value);
+        arg_ = detail::make_arg<Context>(n);
       }
     }
   }
@@ -162,7 +159,7 @@ template <typename T, typename Context> class arg_converter {
 // unsigned).
 template <typename T, typename Context, typename Char>
 void convert_arg(basic_format_arg<Context>& arg, Char type) {
-  visit_format_arg(arg_converter<T, Context>(arg, type), arg);
+  arg.visit(arg_converter<T, Context>(arg, type));
 }
 
 // Converts an integer argument to char for printf.
@@ -175,8 +172,8 @@ template <typename Context> class char_converter {
 
   template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
   void operator()(T value) {
-    arg_ = detail::make_arg<Context>(
-        static_cast<typename Context::char_type>(value));
+    auto c = static_cast<typename Context::char_type>(value);
+    arg_ = detail::make_arg<Context>(c);
   }
 
   template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value)>
@@ -186,122 +183,126 @@ template <typename Context> class char_converter {
 // An argument visitor that return a pointer to a C string if argument is a
 // string or null otherwise.
 template <typename Char> struct get_cstring {
-  template <typename T> const Char* operator()(T) { return nullptr; }
-  const Char* operator()(const Char* s) { return s; }
+  template <typename T> auto operator()(T) -> const Char* { return nullptr; }
+  auto operator()(const Char* s) -> const Char* { return s; }
 };
 
 // Checks if an argument is a valid printf width specifier and sets
 // left alignment if it is negative.
-template <typename Char> class printf_width_handler {
+class printf_width_handler {
  private:
-  using format_specs = basic_format_specs<Char>;
-
   format_specs& specs_;
 
  public:
   explicit printf_width_handler(format_specs& specs) : specs_(specs) {}
 
   template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
-  unsigned operator()(T value) {
+  auto operator()(T value) -> unsigned {
     auto width = static_cast<uint32_or_64_or_128_t<T>>(value);
     if (detail::is_negative(value)) {
       specs_.align = align::left;
       width = 0 - width;
     }
-    unsigned int_max = max_value<int>();
-    if (width > int_max) FMT_THROW(format_error("number is too big"));
+    unsigned int_max = to_unsigned(max_value<int>());
+    if (width > int_max) report_error("number is too big");
     return static_cast<unsigned>(width);
   }
 
   template <typename T, FMT_ENABLE_IF(!std::is_integral<T>::value)>
-  unsigned operator()(T) {
-    FMT_THROW(format_error("width is not integer"));
+  auto operator()(T) -> unsigned {
+    report_error("width is not integer");
     return 0;
   }
 };
 
-// The ``printf`` argument formatter.
-template <typename OutputIt, typename Char>
+// Workaround for a bug with the XL compiler when initializing
+// printf_arg_formatter's base class.
+template <typename Char>
+auto make_arg_formatter(basic_appender<Char> iter, format_specs& s)
+    -> arg_formatter<Char> {
+  return {iter, s, locale_ref()};
+}
+
+// The `printf` argument formatter.
+template <typename Char>
 class printf_arg_formatter : public arg_formatter<Char> {
  private:
   using base = arg_formatter<Char>;
-  using context_type = basic_printf_context<OutputIt, Char>;
-  using format_specs = basic_format_specs<Char>;
+  using context_type = basic_printf_context<Char>;
 
   context_type& context_;
 
-  OutputIt write_null_pointer(bool is_string = false) {
+  void write_null_pointer(bool is_string = false) {
     auto s = this->specs;
     s.type = presentation_type::none;
-    return write_bytes(this->out, is_string ? "(null)" : "(nil)", s);
+    write_bytes<Char>(this->out, is_string ? "(null)" : "(nil)", s);
   }
 
  public:
-  printf_arg_formatter(OutputIt iter, format_specs& s, context_type& ctx)
-      : base{iter, s, locale_ref()}, context_(ctx) {}
+  printf_arg_formatter(basic_appender<Char> iter, format_specs& s,
+                       context_type& ctx)
+      : base(make_arg_formatter(iter, s)), context_(ctx) {}
 
-  OutputIt operator()(monostate value) { return base::operator()(value); }
+  void operator()(monostate value) { base::operator()(value); }
 
   template <typename T, FMT_ENABLE_IF(detail::is_integral<T>::value)>
-  OutputIt operator()(T value) {
+  void operator()(T value) {
     // MSVC2013 fails to compile separate overloads for bool and Char so use
     // std::is_same instead.
-    if (std::is_same<T, Char>::value) {
-      format_specs fmt_specs = this->specs;
-      if (fmt_specs.type != presentation_type::none &&
-          fmt_specs.type != presentation_type::chr) {
-        return (*this)(static_cast<int>(value));
-      }
-      fmt_specs.sign = sign::none;
-      fmt_specs.alt = false;
-      fmt_specs.fill[0] = ' ';  // Ignore '0' flag for char types.
-      // align::numeric needs to be overwritten here since the '0' flag is
-      // ignored for non-numeric types
-      if (fmt_specs.align == align::none || fmt_specs.align == align::numeric)
-        fmt_specs.align = align::right;
-      return write<Char>(this->out, static_cast<Char>(value), fmt_specs);
+    if (!std::is_same<T, Char>::value) {
+      base::operator()(value);
+      return;
     }
-    return base::operator()(value);
+    format_specs s = this->specs;
+    if (s.type != presentation_type::none && s.type != presentation_type::chr) {
+      return (*this)(static_cast<int>(value));
+    }
+    s.sign = sign::none;
+    s.alt = false;
+    s.fill = ' ';  // Ignore '0' flag for char types.
+    // align::numeric needs to be overwritten here since the '0' flag is
+    // ignored for non-numeric types
+    if (s.align == align::none || s.align == align::numeric)
+      s.align = align::right;
+    write<Char>(this->out, static_cast<Char>(value), s);
   }
 
   template <typename T, FMT_ENABLE_IF(std::is_floating_point<T>::value)>
-  OutputIt operator()(T value) {
-    return base::operator()(value);
+  void operator()(T value) {
+    base::operator()(value);
   }
 
-  /** Formats a null-terminated C string. */
-  OutputIt operator()(const char* value) {
-    if (value) return base::operator()(value);
-    return write_null_pointer(this->specs.type != presentation_type::pointer);
+  void operator()(const char* value) {
+    if (value)
+      base::operator()(value);
+    else
+      write_null_pointer(this->specs.type != presentation_type::pointer);
   }
 
-  /** Formats a null-terminated wide C string. */
-  OutputIt operator()(const wchar_t* value) {
-    if (value) return base::operator()(value);
-    return write_null_pointer(this->specs.type != presentation_type::pointer);
+  void operator()(const wchar_t* value) {
+    if (value)
+      base::operator()(value);
+    else
+      write_null_pointer(this->specs.type != presentation_type::pointer);
   }
 
-  OutputIt operator()(basic_string_view<Char> value) {
-    return base::operator()(value);
-  }
+  void operator()(basic_string_view<Char> value) { base::operator()(value); }
 
-  /** Formats a pointer. */
-  OutputIt operator()(const void* value) {
-    return value ? base::operator()(value) : write_null_pointer();
+  void operator()(const void* value) {
+    if (value)
+      base::operator()(value);
+    else
+      write_null_pointer();
   }
 
-  /** Formats an argument of a custom (user-defined) type. */
-  OutputIt operator()(typename basic_format_arg<context_type>::handle handle) {
-    auto parse_ctx =
-        basic_printf_parse_context<Char>(basic_string_view<Char>());
+  void operator()(typename basic_format_arg<context_type>::handle handle) {
+    auto parse_ctx = basic_format_parse_context<Char>({});
     handle.format(parse_ctx, context_);
-    return this->out;
   }
 };
 
 template <typename Char>
-void parse_flags(basic_format_specs<Char>& specs, const Char*& it,
-                 const Char* end) {
+void parse_flags(format_specs& specs, const Char*& it, const Char* end) {
   for (; it != end; ++it) {
     switch (*it) {
     case '-':
@@ -311,12 +312,10 @@ void parse_flags(basic_format_specs<Char>& specs, const Char*& it,
       specs.sign = sign::plus;
       break;
     case '0':
-      specs.fill[0] = '0';
+      specs.fill = '0';
       break;
     case ' ':
-      if (specs.sign != sign::plus) {
-        specs.sign = sign::space;
-      }
+      if (specs.sign != sign::plus) specs.sign = sign::space;
       break;
     case '#':
       specs.alt = true;
@@ -328,8 +327,8 @@ void parse_flags(basic_format_specs<Char>& specs, const Char*& it,
 }
 
 template <typename Char, typename GetArg>
-int parse_header(const Char*& it, const Char* end,
-                 basic_format_specs<Char>& specs, GetArg get_arg) {
+auto parse_header(const Char*& it, const Char* end, format_specs& specs,
+                  GetArg get_arg) -> int {
   int arg_index = -1;
   Char c = *it;
   if (c >= '0' && c <= '9') {
@@ -340,11 +339,11 @@ int parse_header(const Char*& it, const Char* end,
       ++it;
       arg_index = value != -1 ? value : max_value<int>();
     } else {
-      if (c == '0') specs.fill[0] = '0';
+      if (c == '0') specs.fill = '0';
       if (value != 0) {
         // Nonzero value means that we parsed width and don't need to
         // parse it or flags again, so return now.
-        if (value == -1) FMT_THROW(format_error("number is too big"));
+        if (value == -1) report_error("number is too big");
         specs.width = value;
         return arg_index;
       }
@@ -355,23 +354,68 @@ int parse_header(const Char*& it, const Char* end,
   if (it != end) {
     if (*it >= '0' && *it <= '9') {
       specs.width = parse_nonnegative_int(it, end, -1);
-      if (specs.width == -1) FMT_THROW(format_error("number is too big"));
+      if (specs.width == -1) report_error("number is too big");
     } else if (*it == '*') {
       ++it;
-      specs.width = static_cast<int>(visit_format_arg(
-          detail::printf_width_handler<Char>(specs), get_arg(-1)));
+      specs.width = static_cast<int>(
+          get_arg(-1).visit(detail::printf_width_handler(specs)));
     }
   }
   return arg_index;
 }
 
+inline auto parse_printf_presentation_type(char c, type t, bool& upper)
+    -> presentation_type {
+  using pt = presentation_type;
+  constexpr auto integral_set = sint_set | uint_set | bool_set | char_set;
+  switch (c) {
+  case 'd':
+    return in(t, integral_set) ? pt::dec : pt::none;
+  case 'o':
+    return in(t, integral_set) ? pt::oct : pt::none;
+  case 'X':
+    upper = true;
+    FMT_FALLTHROUGH;
+  case 'x':
+    return in(t, integral_set) ? pt::hex : pt::none;
+  case 'E':
+    upper = true;
+    FMT_FALLTHROUGH;
+  case 'e':
+    return in(t, float_set) ? pt::exp : pt::none;
+  case 'F':
+    upper = true;
+    FMT_FALLTHROUGH;
+  case 'f':
+    return in(t, float_set) ? pt::fixed : pt::none;
+  case 'G':
+    upper = true;
+    FMT_FALLTHROUGH;
+  case 'g':
+    return in(t, float_set) ? pt::general : pt::none;
+  case 'A':
+    upper = true;
+    FMT_FALLTHROUGH;
+  case 'a':
+    return in(t, float_set) ? pt::hexfloat : pt::none;
+  case 'c':
+    return in(t, integral_set) ? pt::chr : pt::none;
+  case 's':
+    return in(t, string_set | cstring_set) ? pt::string : pt::none;
+  case 'p':
+    return in(t, pointer_set | cstring_set) ? pt::pointer : pt::none;
+  default:
+    return pt::none;
+  }
+}
+
 template <typename Char, typename Context>
 void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
              basic_format_args<Context> args) {
-  using OutputIt = buffer_appender<Char>;
-  auto out = OutputIt(buf);
-  auto context = basic_printf_context<OutputIt, Char>(out, args);
-  auto parse_ctx = basic_printf_parse_context<Char>(format);
+  using iterator = basic_appender<Char>;
+  auto out = iterator(buf);
+  auto context = basic_printf_context<Char>(out, args);
+  auto parse_ctx = basic_format_parse_context<Char>(format);
 
   // Returns the argument with specified index or, if arg_index is -1, the next
   // argument.
@@ -387,26 +431,24 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
   const Char* end = parse_ctx.end();
   auto it = start;
   while (it != end) {
-    if (!detail::find<false, Char>(it, end, '%', it)) {
-      it = end;  // detail::find leaves it == nullptr if it doesn't find '%'
+    if (!find<false, Char>(it, end, '%', it)) {
+      it = end;  // find leaves it == nullptr if it doesn't find '%'.
       break;
     }
     Char c = *it++;
     if (it != end && *it == c) {
-      out = detail::write(
-          out, basic_string_view<Char>(start, detail::to_unsigned(it - start)));
+      write(out, basic_string_view<Char>(start, to_unsigned(it - start)));
       start = ++it;
       continue;
     }
-    out = detail::write(out, basic_string_view<Char>(
-                                 start, detail::to_unsigned(it - 1 - start)));
+    write(out, basic_string_view<Char>(start, to_unsigned(it - 1 - start)));
 
-    basic_format_specs<Char> specs;
+    auto specs = format_specs();
     specs.align = align::right;
 
     // Parse argument index, flags and width.
     int arg_index = parse_header(it, end, specs, get_arg);
-    if (arg_index == 0) parse_ctx.on_error("argument not found");
+    if (arg_index == 0) report_error("argument not found");
 
     // Parse precision.
     if (it != end && *it == '.') {
@@ -416,8 +458,8 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
         specs.precision = parse_nonnegative_int(it, end, 0);
       } else if (c == '*') {
         ++it;
-        specs.precision = static_cast<int>(
-            visit_format_arg(detail::printf_precision_handler(), get_arg(-1)));
+        specs.precision =
+            static_cast<int>(get_arg(-1).visit(printf_precision_handler()));
       } else {
         specs.precision = 0;
       }
@@ -426,32 +468,30 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
     auto arg = get_arg(arg_index);
     // For d, i, o, u, x, and X conversion specifiers, if a precision is
     // specified, the '0' flag is ignored
-    if (specs.precision >= 0 && arg.is_integral())
-      specs.fill[0] =
-          ' ';  // Ignore '0' flag for non-numeric types or if '-' present.
-    if (specs.precision >= 0 && arg.type() == detail::type::cstring_type) {
-      auto str = visit_format_arg(detail::get_cstring<Char>(), arg);
+    if (specs.precision >= 0 && arg.is_integral()) {
+      // Ignore the '0' flag for integral arguments when a precision is given.
+      specs.fill = ' ';
+    }
+    if (specs.precision >= 0 && arg.type() == type::cstring_type) {
+      auto str = arg.visit(get_cstring<Char>());
       auto str_end = str + specs.precision;
       auto nul = std::find(str, str_end, Char());
-      arg = detail::make_arg<basic_printf_context<OutputIt, Char>>(
-          basic_string_view<Char>(
-              str, detail::to_unsigned(nul != str_end ? nul - str
-                                                      : specs.precision)));
+      auto sv = basic_string_view<Char>(
+          str, to_unsigned(nul != str_end ? nul - str : specs.precision));
+      arg = make_arg<basic_printf_context<Char>>(sv);
     }
-    if (specs.alt && visit_format_arg(detail::is_zero_int(), arg))
-      specs.alt = false;
-    if (specs.fill[0] == '0') {
+    if (specs.alt && arg.visit(is_zero_int())) specs.alt = false;
+    if (specs.fill.template get<Char>() == '0') {
       if (arg.is_arithmetic() && specs.align != align::left)
         specs.align = align::numeric;
       else
-        specs.fill[0] = ' ';  // Ignore '0' flag for non-numeric types or if '-'
-                              // flag is also present.
+        specs.fill = ' ';  // Ignore '0' flag for non-numeric types or if '-'
+                           // flag is also present.
     }
 
     // Parse length and convert the argument to the required type.
     c = it != end ? *it++ : 0;
     Char t = it != end ? *it : 0;
-    using detail::convert_arg;
     switch (c) {
     case 'h':
       if (t == 'h') {
@@ -490,7 +530,7 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
     }
 
     // Parse type.
-    if (it == end) FMT_THROW(format_error("invalid format string"));
+    if (it == end) report_error("invalid format string");
     char type = static_cast<char>(*it++);
     if (arg.is_integral()) {
       // Normalize type.
@@ -500,141 +540,117 @@ void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
         type = 'd';
         break;
       case 'c':
-        visit_format_arg(
-            detail::char_converter<basic_printf_context<OutputIt, Char>>(arg),
-            arg);
+        arg.visit(char_converter<basic_printf_context<Char>>(arg));
         break;
       }
     }
-    specs.type = parse_presentation_type(type);
+    bool upper = false;
+    specs.type = parse_printf_presentation_type(type, arg.type(), upper);
     if (specs.type == presentation_type::none)
-      parse_ctx.on_error("invalid type specifier");
+      report_error("invalid format specifier");
+    specs.upper = upper;
 
     start = it;
 
     // Format argument.
-    out = visit_format_arg(
-        detail::printf_arg_formatter<OutputIt, Char>(out, specs, context), arg);
+    arg.visit(printf_arg_formatter<Char>(out, specs, context));
   }
-  detail::write(out, basic_string_view<Char>(start, to_unsigned(it - start)));
+  write(out, basic_string_view<Char>(start, to_unsigned(it - start)));
 }
-FMT_END_DETAIL_NAMESPACE
+}  // namespace detail
 
-template <typename Char>
-using basic_printf_context_t =
-    basic_printf_context<detail::buffer_appender<Char>, Char>;
-
-using printf_context = basic_printf_context_t<char>;
-using wprintf_context = basic_printf_context_t<wchar_t>;
+using printf_context = basic_printf_context<char>;
+using wprintf_context = basic_printf_context<wchar_t>;
 
 using printf_args = basic_format_args<printf_context>;
 using wprintf_args = basic_format_args<wprintf_context>;
 
-/**
-  \rst
-  Constructs an `~fmt::format_arg_store` object that contains references to
-  arguments and can be implicitly converted to `~fmt::printf_args`.
-  \endrst
- */
-template <typename... T>
-inline auto make_printf_args(const T&... args)
-    -> format_arg_store<printf_context, T...> {
-  return {args...};
+/// Constructs a `format_arg_store` object that contains references to
+/// arguments and can be implicitly converted to `printf_args`.
+template <typename Char = char, typename... T>
+inline auto make_printf_args(T&... args)
+    -> decltype(fmt::make_format_args<basic_printf_context<Char>>(args...)) {
+  return fmt::make_format_args<basic_printf_context<Char>>(args...);
 }
 
-/**
-  \rst
-  Constructs an `~fmt::format_arg_store` object that contains references to
-  arguments and can be implicitly converted to `~fmt::wprintf_args`.
-  \endrst
- */
-template <typename... T>
-inline auto make_wprintf_args(const T&... args)
-    -> format_arg_store<wprintf_context, T...> {
-  return {args...};
-}
+template <typename Char> struct vprintf_args {
+  using type = basic_format_args<basic_printf_context<Char>>;
+};
 
-template <typename S, typename Char = char_t<S>>
-inline auto vsprintf(
-    const S& fmt,
-    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args)
+template <typename Char>
+inline auto vsprintf(basic_string_view<Char> fmt,
+                     typename vprintf_args<Char>::type args)
     -> std::basic_string<Char> {
-  basic_memory_buffer<Char> buffer;
-  vprintf(buffer, detail::to_string_view(fmt), args);
-  return to_string(buffer);
+  auto buf = basic_memory_buffer<Char>();
+  detail::vprintf(buf, fmt, args);
+  return to_string(buf);
 }
 
 /**
-  \rst
-  Formats arguments and returns the result as a string.
-
-  **Example**::
-
-    std::string message = fmt::sprintf("The answer is %d", 42);
-  \endrst
-*/
-template <typename S, typename... T,
-          typename Char = enable_if_t<detail::is_string<S>::value, char_t<S>>>
+ * Formats `args` according to specifications in `fmt` and returns the result
+ * as a string.
+ *
+ * **Example**:
+ *
+ *     std::string message = fmt::sprintf("The answer is %d", 42);
+ */
+template <typename S, typename... T, typename Char = char_t<S>>
 inline auto sprintf(const S& fmt, const T&... args) -> std::basic_string<Char> {
-  using context = basic_printf_context_t<Char>;
   return vsprintf(detail::to_string_view(fmt),
-                  fmt::make_format_args<context>(args...));
+                  fmt::make_format_args<basic_printf_context<Char>>(args...));
 }
 
-template <typename S, typename Char = char_t<S>>
-inline auto vfprintf(
-    std::FILE* f, const S& fmt,
-    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args)
-    -> int {
-  basic_memory_buffer<Char> buffer;
-  vprintf(buffer, detail::to_string_view(fmt), args);
-  size_t size = buffer.size();
-  return std::fwrite(buffer.data(), sizeof(Char), size, f) < size
+template <typename Char>
+inline auto vfprintf(std::FILE* f, basic_string_view<Char> fmt,
+                     typename vprintf_args<Char>::type args) -> int {
+  auto buf = basic_memory_buffer<Char>();
+  detail::vprintf(buf, fmt, args);
+  size_t size = buf.size();
+  return std::fwrite(buf.data(), sizeof(Char), size, f) < size
              ? -1
              : static_cast<int>(size);
 }
 
 /**
-  \rst
-  Prints formatted data to the file *f*.
-
-  **Example**::
-
-    fmt::fprintf(stderr, "Don't %s!", "panic");
-  \endrst
+ * Formats `args` according to specifications in `fmt` and writes the output
+ * to `f`.
+ *
+ * **Example**:
+ *
+ *     fmt::fprintf(stderr, "Don't %s!", "panic");
  */
 template <typename S, typename... T, typename Char = char_t<S>>
 inline auto fprintf(std::FILE* f, const S& fmt, const T&... args) -> int {
-  using context = basic_printf_context_t<Char>;
   return vfprintf(f, detail::to_string_view(fmt),
-                  fmt::make_format_args<context>(args...));
+                  make_printf_args<Char>(args...));
 }
 
-template <typename S, typename Char = char_t<S>>
-inline auto vprintf(
-    const S& fmt,
-    basic_format_args<basic_printf_context_t<type_identity_t<Char>>> args)
+template <typename Char>
+FMT_DEPRECATED inline auto vprintf(basic_string_view<Char> fmt,
+                                   typename vprintf_args<Char>::type args)
     -> int {
-  return vfprintf(stdout, detail::to_string_view(fmt), args);
+  return vfprintf(stdout, fmt, args);
 }
 
 /**
-  \rst
-  Prints formatted data to ``stdout``.
-
-  **Example**::
-
-    fmt::printf("Elapsed time: %.2f seconds", 1.23);
-  \endrst
+ * Formats `args` according to specifications in `fmt` and writes the output
+ * to `stdout`.
+ *
+ * **Example**:
+ *
+ *     fmt::printf("Elapsed time: %.2f seconds", 1.23);
  */
-template <typename S, typename... T, FMT_ENABLE_IF(detail::is_string<S>::value)>
-inline auto printf(const S& fmt, const T&... args) -> int {
-  return vprintf(
-      detail::to_string_view(fmt),
-      fmt::make_format_args<basic_printf_context_t<char_t<S>>>(args...));
+template <typename... T>
+inline auto printf(string_view fmt, const T&... args) -> int {
+  return vfprintf(stdout, fmt, make_printf_args(args...));
+}
+template <typename... T>
+FMT_DEPRECATED inline auto printf(basic_string_view<wchar_t> fmt,
+                                  const T&... args) -> int {
+  return vfprintf(stdout, fmt, make_printf_args<wchar_t>(args...));
 }
 
-FMT_MODULE_EXPORT_END
+FMT_END_EXPORT
 FMT_END_NAMESPACE
 
 #endif  // FMT_PRINTF_H_
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/ranges.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/ranges.h
index 2105a668822c..0d3dfbd8d378 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/ranges.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/ranges.h
@@ -1,80 +1,38 @@
-// Formatting library for C++ - experimental range support
+// Formatting library for C++ - range and tuple support
 //
-// Copyright (c) 2012 - present, Victor Zverovich
+// Copyright (c) 2012 - present, Victor Zverovich and {fmt} contributors
 // All rights reserved.
 //
 // For the license information refer to format.h.
-//
-// Copyright (c) 2018 - present, Remotion (Igor Schulz)
-// All Rights Reserved
-// {fmt} support for ranges, containers and types tuple interface.
 
 #ifndef FMT_RANGES_H_
 #define FMT_RANGES_H_
 
-#include <initializer_list>
-#include <tuple>
-#include <type_traits>
+#ifndef FMT_MODULE
+#  include <initializer_list>
+#  include <iterator>
+#  include <string>
+#  include <tuple>
+#  include <type_traits>
+#  include <utility>
+#endif
 
 #include "format.h"
 
 FMT_BEGIN_NAMESPACE
 
-namespace detail {
-
-template <typename RangeT, typename OutputIterator>
-OutputIterator copy(const RangeT& range, OutputIterator out) {
-  for (auto it = range.begin(), end = range.end(); it != end; ++it)
-    *out++ = *it;
-  return out;
-}
-
-template <typename OutputIterator>
-OutputIterator copy(const char* str, OutputIterator out) {
-  while (*str) *out++ = *str++;
-  return out;
-}
-
-template <typename OutputIterator>
-OutputIterator copy(char ch, OutputIterator out) {
-  *out++ = ch;
-  return out;
-}
-
-template <typename OutputIterator>
-OutputIterator copy(wchar_t ch, OutputIterator out) {
-  *out++ = ch;
-  return out;
-}
-
-// Returns true if T has a std::string-like interface, like std::string_view.
-template <typename T> class is_std_string_like {
-  template <typename U>
-  static auto check(U* p)
-      -> decltype((void)p->find('a'), p->length(), (void)p->data(), int());
-  template <typename> static void check(...);
-
- public:
-  static constexpr const bool value =
-      is_string<T>::value ||
-      std::is_convertible<T, std_string_view<char>>::value ||
-      !std::is_void<decltype(check<T>(nullptr))>::value;
-};
+FMT_EXPORT
+enum class range_format { disabled, map, set, sequence, string, debug_string };
 
-template <typename Char>
-struct is_std_string_like<fmt::basic_string_view<Char>> : std::true_type {};
+namespace detail {
 
 template <typename T> class is_map {
   template <typename U> static auto check(U*) -> typename U::mapped_type;
   template <typename> static void check(...);
 
  public:
-#ifdef FMT_FORMAT_MAP_AS_LIST
-  static constexpr const bool value = false;
-#else
   static constexpr const bool value =
       !std::is_void<decltype(check<T>(nullptr))>::value;
-#endif
 };
 
 template <typename T> class is_set {
@@ -82,12 +40,8 @@ template <typename T> class is_set {
   template <typename> static void check(...);
 
  public:
-#ifdef FMT_FORMAT_SET_AS_LIST
-  static constexpr const bool value = false;
-#else
   static constexpr const bool value =
       !std::is_void<decltype(check<T>(nullptr))>::value && !is_map<T>::value;
-#endif
 };
 
 template <typename... Ts> struct conditional_helper {};
@@ -116,17 +70,17 @@ template <typename T, typename Enable = void>
 struct has_member_fn_begin_end_t : std::false_type {};
 
 template <typename T>
-struct has_member_fn_begin_end_t<T, void_t<decltype(std::declval<T>().begin()),
+struct has_member_fn_begin_end_t<T, void_t<decltype(*std::declval<T>().begin()),
                                            decltype(std::declval<T>().end())>>
     : std::true_type {};
 
-// Member function overload
+// Member function overloads.
 template <typename T>
 auto range_begin(T&& rng) FMT_DECLTYPE_RETURN(static_cast<T&&>(rng).begin());
 template <typename T>
 auto range_end(T&& rng) FMT_DECLTYPE_RETURN(static_cast<T&&>(rng).end());
 
-// ADL overload. Only participates in overload resolution if member functions
+// ADL overloads. Only participate in overload resolution if member functions
 // are not found.
 template <typename T>
 auto range_begin(T&& rng)
@@ -147,18 +101,19 @@ struct has_mutable_begin_end : std::false_type {};
 
 template <typename T>
 struct has_const_begin_end<
-    T,
-    void_t<
-        decltype(detail::range_begin(std::declval<const remove_cvref_t<T>&>())),
-        decltype(detail::range_end(std::declval<const remove_cvref_t<T>&>()))>>
+    T, void_t<decltype(*detail::range_begin(
+                  std::declval<const remove_cvref_t<T>&>())),
+              decltype(detail::range_end(
+                  std::declval<const remove_cvref_t<T>&>()))>>
     : std::true_type {};
 
 template <typename T>
 struct has_mutable_begin_end<
-    T, void_t<decltype(detail::range_begin(std::declval<T>())),
-              decltype(detail::range_end(std::declval<T>())),
-              enable_if_t<std::is_copy_constructible<T>::value>>>
-    : std::true_type {};
+    T, void_t<decltype(*detail::range_begin(std::declval<T&>())),
+              decltype(detail::range_end(std::declval<T&>())),
+              // the extra int here is because older versions of MSVC don't
+              // SFINAE properly unless there are distinct types
+              int>> : std::true_type {};
 
 template <typename T>
 struct is_range_<T, void>
@@ -188,7 +143,7 @@ template <size_t N> using make_index_sequence = std::make_index_sequence<N>;
 template <typename T, T... N> struct integer_sequence {
   using value_type = T;
 
-  static FMT_CONSTEXPR size_t size() { return sizeof...(N); }
+  static FMT_CONSTEXPR auto size() -> size_t { return sizeof...(N); }
 };
 
 template <size_t... N> using index_sequence = integer_sequence<size_t, N...>;
@@ -211,41 +166,62 @@ class is_tuple_formattable_ {
   static constexpr const bool value = false;
 };
 template <typename T, typename C> class is_tuple_formattable_<T, C, true> {
-  template <std::size_t... Is>
-  static std::true_type check2(index_sequence<Is...>,
-                               integer_sequence<bool, (Is == Is)...>);
-  static std::false_type check2(...);
-  template <std::size_t... Is>
-  static decltype(check2(
+  template <size_t... Is>
+  static auto all_true(index_sequence<Is...>,
+                       integer_sequence<bool, (Is >= 0)...>) -> std::true_type;
+  static auto all_true(...) -> std::false_type;
+
+  template <size_t... Is>
+  static auto check(index_sequence<Is...>) -> decltype(all_true(
       index_sequence<Is...>{},
-      integer_sequence<
-          bool, (is_formattable<typename std::tuple_element<Is, T>::type,
-                                C>::value)...>{})) check(index_sequence<Is...>);
+      integer_sequence<bool,
+                       (is_formattable<typename std::tuple_element<Is, T>::type,
+                                       C>::value)...>{}));
 
  public:
   static constexpr const bool value =
       decltype(check(tuple_index_sequence<T>{}))::value;
 };
 
-template <class Tuple, class F, size_t... Is>
-void for_each(index_sequence<Is...>, Tuple&& tup, F&& f) noexcept {
+template <typename Tuple, typename F, size_t... Is>
+FMT_CONSTEXPR void for_each(index_sequence<Is...>, Tuple&& t, F&& f) {
   using std::get;
-  // using free function get<I>(T) now.
-  const int _[] = {0, ((void)f(get<Is>(tup)), 0)...};
-  (void)_;  // blocks warnings
+  // Using a free function get<Is>(Tuple) now.
+  const int unused[] = {0, ((void)f(get<Is>(t)), 0)...};
+  ignore_unused(unused);
 }
 
-template <class T>
-FMT_CONSTEXPR make_index_sequence<std::tuple_size<T>::value> get_indexes(
-    T const&) {
-  return {};
+template <typename Tuple, typename F>
+FMT_CONSTEXPR void for_each(Tuple&& t, F&& f) {
+  for_each(tuple_index_sequence<remove_cvref_t<Tuple>>(),
+           std::forward<Tuple>(t), std::forward<F>(f));
 }
 
-template <class Tuple, class F> void for_each(Tuple&& tup, F&& f) {
-  const auto indexes = get_indexes(tup);
-  for_each(indexes, std::forward<Tuple>(tup), std::forward<F>(f));
+template <typename Tuple1, typename Tuple2, typename F, size_t... Is>
+void for_each2(index_sequence<Is...>, Tuple1&& t1, Tuple2&& t2, F&& f) {
+  using std::get;  // unqualified get<Is>: std::get or an ADL-found overload
+  const int unused[] = {0, ((void)f(get<Is>(t1), get<Is>(t2)), 0)...};
+  ignore_unused(unused);  // the array exists only to expand the pack in order
+}
+
+template <typename Tuple1, typename Tuple2, typename F>
+void for_each2(Tuple1&& t1, Tuple2&& t2, F&& f) {  // indices come from Tuple1
+  for_each2(tuple_index_sequence<remove_cvref_t<Tuple1>>(),
+            std::forward<Tuple1>(t1), std::forward<Tuple2>(t2),
+            std::forward<F>(f));
+}
+
+namespace tuple {  // computes the tuple-of-formatters type for a tuple-like
+// Workaround a bug in MSVC 2019 (v140).
+template <typename Char, typename... T>  // one formatter per element type
+using result_t = std::tuple<formatter<remove_cvref_t<T>, Char>...>;
+
+using std::get;  // lets the unqualified get<Is> below resolve to std::get
+template <typename Tuple, typename Char, std::size_t... Is>
+auto get_formatters(index_sequence<Is...>)  // declaration only (no body)
+    -> result_t<Char, decltype(get<Is>(std::declval<Tuple>()))...>;
+}  // namespace tuple
+
 #if FMT_MSC_VERSION && FMT_MSC_VERSION < 1920
 // Older MSVC doesn't get the reference type correctly for arrays.
 template <typename R> struct range_reference_type_impl {
@@ -269,45 +245,48 @@ using range_reference_type =
 template <typename Range>
 using uncvref_type = remove_cvref_t<range_reference_type<Range>>;
 
-template <typename Range>
-using uncvref_first_type =
-    remove_cvref_t<decltype(std::declval<range_reference_type<Range>>().first)>;
-
-template <typename Range>
-using uncvref_second_type = remove_cvref_t<
-    decltype(std::declval<range_reference_type<Range>>().second)>;
-
-template <typename OutputIt> OutputIt write_delimiter(OutputIt out) {
-  *out++ = ',';
-  *out++ = ' ';
-  return out;
+template <typename Formatter>  // chosen when f.set_debug_format(bool) exists
+FMT_CONSTEXPR auto maybe_set_debug_format(Formatter& f, bool set)
+    -> decltype(f.set_debug_format(set)) {
+  f.set_debug_format(set);
 }
+template <typename Formatter>  // fallback overload: does nothing
+FMT_CONSTEXPR void maybe_set_debug_format(Formatter&, ...) {}
 
-template <typename Char, typename OutputIt>
-auto write_range_entry(OutputIt out, basic_string_view<Char> str) -> OutputIt {
-  return write_escaped_string(out, str);
-}
+template <typename T>  // picks the default range_format for range type T
+struct range_format_kind_
+    : std::integral_constant<range_format,
+                             std::is_same<uncvref_type<T>, T>::value
+                                 ? range_format::disabled  // elements are T
+                             : is_map<T>::value ? range_format::map
+                             : is_set<T>::value ? range_format::set
+                                                : range_format::sequence> {};
 
-template <typename Char, typename OutputIt, typename T,
-          FMT_ENABLE_IF(std::is_convertible<T, std_string_view<char>>::value)>
-inline auto write_range_entry(OutputIt out, const T& str) -> OutputIt {
-  auto sv = std_string_view<Char>(str);
-  return write_range_entry<Char>(out, basic_string_view<Char>(sv));
-}
+template <range_format K>
+using range_format_constant = std::integral_constant<range_format, K>;
 
-template <typename Char, typename OutputIt, typename Arg,
-          FMT_ENABLE_IF(std::is_same<Arg, Char>::value)>
-OutputIt write_range_entry(OutputIt out, const Arg v) {
-  return write_escaped_char(out, v);
-}
+// These are not generic lambdas for compatibility with C++11.
+template <typename ParseContext> struct parse_empty_specs {
+  template <typename Formatter> FMT_CONSTEXPR void operator()(Formatter& f) {
+    f.parse(ctx);  // the returned iterator is discarded; ctx is not advanced
+    detail::maybe_set_debug_format(f, true);  // requests debug format if any
+  }
+  ParseContext& ctx;  // shared by every formatter this functor visits
+};
+template <typename FormatContext> struct format_tuple_element {
+  using char_type = typename FormatContext::char_type;
+
+  template <typename T>  // formats one element and advances ctx past it
+  void operator()(const formatter<T, char_type>& f, const T& v) {
+    if (i > 0) ctx.advance_to(detail::copy<char_type>(separator, ctx.out()));
+    ctx.advance_to(f.format(v, ctx));
+    ++i;
+  }
 
-template <
-    typename Char, typename OutputIt, typename Arg,
-    FMT_ENABLE_IF(!is_std_string_like<typename std::decay<Arg>::type>::value &&
-                  !std::is_same<Arg, Char>::value)>
-OutputIt write_range_entry(OutputIt out, const Arg& v) {
-  return write<Char>(out, v);
-}
+  int i;  // number of elements written so far
+  FormatContext& ctx;  // the output position lives in ctx, not in a local
+  basic_string_view<char_type> separator;  // written before all but the first
+};
 
 }  // namespace detail
 
@@ -321,29 +300,20 @@ template <typename T, typename C> struct is_tuple_formattable {
       detail::is_tuple_formattable_<T, C>::value;
 };
 
-template <typename TupleT, typename Char>
-struct formatter<TupleT, Char,
-                 enable_if_t<fmt::is_tuple_like<TupleT>::value &&
-                             fmt::is_tuple_formattable<TupleT, Char>::value>> {
+template <typename Tuple, typename Char>
+struct formatter<Tuple, Char,
+                 enable_if_t<fmt::is_tuple_like<Tuple>::value &&
+                             fmt::is_tuple_formattable<Tuple, Char>::value>> {
  private:
+  decltype(detail::tuple::get_formatters<Tuple, Char>(
+      detail::tuple_index_sequence<Tuple>())) formatters_;
+
   basic_string_view<Char> separator_ = detail::string_literal<Char, ',', ' '>{};
   basic_string_view<Char> opening_bracket_ =
       detail::string_literal<Char, '('>{};
   basic_string_view<Char> closing_bracket_ =
       detail::string_literal<Char, ')'>{};
 
-  // C++11 generic lambda for format().
-  template <typename FormatContext> struct format_each {
-    template <typename T> void operator()(const T& v) {
-      if (i > 0) out = detail::copy_str<Char>(separator, out);
-      out = detail::write_range_entry<Char>(out, v);
-      ++i;
-    }
-    int i;
-    typename FormatContext::iterator& out;
-    basic_string_view<Char> separator;
-  };
-
  public:
   FMT_CONSTEXPR formatter() {}
 
@@ -359,25 +329,26 @@ struct formatter<TupleT, Char,
 
   template <typename ParseContext>
   FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
-    return ctx.begin();
+    auto it = ctx.begin();
+    if (it != ctx.end() && *it != '}') report_error("invalid format specifier");
+    detail::for_each(formatters_, detail::parse_empty_specs<ParseContext>{ctx});
+    return it;
   }
 
-  template <typename FormatContext = format_context>
-  auto format(const TupleT& values, FormatContext& ctx) const
+  template <typename FormatContext>
+  auto format(const Tuple& value, FormatContext& ctx) const
       -> decltype(ctx.out()) {
-    auto out = ctx.out();
-    out = detail::copy_str<Char>(opening_bracket_, out);
-    detail::for_each(values, format_each<FormatContext>{0, out, separator_});
-    out = detail::copy_str<Char>(closing_bracket_, out);
-    return out;
+    ctx.advance_to(detail::copy<Char>(opening_bracket_, ctx.out()));
+    detail::for_each2(
+        formatters_, value,
+        detail::format_tuple_element<FormatContext>{0, ctx, separator_});
+    return detail::copy<Char>(closing_bracket_, ctx.out());
   }
 };
 
 template <typename T, typename Char> struct is_range {
   static constexpr const bool value =
-      detail::is_range_<T>::value && !detail::is_std_string_like<T>::value &&
-      !std::is_convertible<T, std::basic_string<Char>>::value &&
-      !std::is_convertible<T, detail::std_string_view<Char>>::value;
+      detail::is_range_<T>::value && !detail::has_to_string_view<T>::value;
 };
 
 namespace detail {
@@ -398,12 +369,10 @@ template <typename Context> struct range_mapper {
 };
 
 template <typename Char, typename Element>
-using range_formatter_type = conditional_t<
-    is_formattable<Element, Char>::value,
-    formatter<remove_cvref_t<decltype(range_mapper<buffer_context<Char>>{}.map(
-                  std::declval<Element>()))>,
-              Char>,
-    fallback_formatter<Element, Char>>;
+using range_formatter_type =
+    formatter<remove_cvref_t<decltype(range_mapper<buffered_context<Char>>{}
+                                          .map(std::declval<Element>()))>,
+              Char>;
 
 template <typename R>
 using maybe_const_range =
@@ -413,43 +382,48 @@ using maybe_const_range =
 #if !FMT_MSC_VERSION || FMT_MSC_VERSION >= 1910
 template <typename R, typename Char>
 struct is_formattable_delayed
-    : disjunction<
-          is_formattable<uncvref_type<maybe_const_range<R>>, Char>,
-          has_fallback_formatter<uncvref_type<maybe_const_range<R>>, Char>> {};
+    : is_formattable<uncvref_type<maybe_const_range<R>>, Char> {};
 #endif
-
 }  // namespace detail
 
+template <typename...> struct conjunction : std::true_type {};  // empty: true
+template <typename P> struct conjunction<P> : P {};  // one predicate: itself
+template <typename P1, typename... Pn>  // stops at the first false P1
+struct conjunction<P1, Pn...>
+    : conditional_t<bool(P1::value), conjunction<Pn...>, P1> {};
+
 template <typename T, typename Char, typename Enable = void>
 struct range_formatter;
 
 template <typename T, typename Char>
 struct range_formatter<
     T, Char,
-    enable_if_t<conjunction<
-        std::is_same<T, remove_cvref_t<T>>,
-        disjunction<is_formattable<T, Char>,
-                    detail::has_fallback_formatter<T, Char>>>::value>> {
+    enable_if_t<conjunction<std::is_same<T, remove_cvref_t<T>>,
+                            is_formattable<T, Char>>::value>> {
  private:
   detail::range_formatter_type<Char, T> underlying_;
-  bool custom_specs_ = false;
   basic_string_view<Char> separator_ = detail::string_literal<Char, ',', ' '>{};
   basic_string_view<Char> opening_bracket_ =
       detail::string_literal<Char, '['>{};
   basic_string_view<Char> closing_bracket_ =
       detail::string_literal<Char, ']'>{};
-
-  template <class U>
-  FMT_CONSTEXPR static auto maybe_set_debug_format(U& u, int)
-      -> decltype(u.set_debug_format()) {
-    u.set_debug_format();
+  bool is_debug = false;
+
+  template <typename Output, typename It, typename Sentinel, typename U = T,
+            FMT_ENABLE_IF(std::is_same<U, Char>::value)>
+  auto write_debug_string(Output& out, It it, Sentinel end) const -> Output {
+    auto buf = basic_memory_buffer<Char>();
+    for (; it != end; ++it) buf.push_back(*it);  // gather [it, end) in buf
+    auto specs = format_specs();
+    specs.type = presentation_type::debug;  // request debug presentation
+    return detail::write<Char>(
+        out, basic_string_view<Char>(buf.data(), buf.size()), specs);
   }
 
-  template <class U>
-  FMT_CONSTEXPR static void maybe_set_debug_format(U&, ...) {}
-
-  FMT_CONSTEXPR void maybe_set_debug_format() {
-    maybe_set_debug_format(underlying_, 0);
+  template <typename Output, typename It, typename Sentinel, typename U = T,
+            FMT_ENABLE_IF(!std::is_same<U, Char>::value)>
+  auto write_debug_string(Output& out, It, Sentinel) const -> Output {
+    return out;  // element type is not Char: nothing is written
   }
 
  public:
@@ -473,92 +447,190 @@ struct range_formatter<
   FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
     auto it = ctx.begin();
     auto end = ctx.end();
-    if (it == end || *it == '}') {
-      maybe_set_debug_format();
-      return it;
-    }
+    detail::maybe_set_debug_format(underlying_, true);
+    if (it == end) return underlying_.parse(ctx);
 
-    if (*it == 'n') {
+    switch (detail::to_ascii(*it)) {
+    case 'n':
       set_brackets({}, {});
       ++it;
-    }
-
-    if (*it == '}') {
-      maybe_set_debug_format();
+      break;
+    case '?':
+      is_debug = true;
+      set_brackets({}, {});
+      ++it;
+      if (it == end || *it != 's') report_error("invalid format specifier");
+      FMT_FALLTHROUGH;
+    case 's':
+      if (!std::is_same<T, Char>::value)
+        report_error("invalid format specifier");
+      if (!is_debug) {
+        set_brackets(detail::string_literal<Char, '"'>{},
+                     detail::string_literal<Char, '"'>{});
+        set_separator({});
+        detail::maybe_set_debug_format(underlying_, false);
+      }
+      ++it;
       return it;
     }
 
-    if (*it != ':')
-      FMT_THROW(format_error("no other top-level range formatters supported"));
+    if (it != end && *it != '}') {
+      if (*it != ':') report_error("invalid format specifier");
+      detail::maybe_set_debug_format(underlying_, false);
+      ++it;
+    }
 
-    custom_specs_ = true;
-    ++it;
     ctx.advance_to(it);
     return underlying_.parse(ctx);
   }
 
-  template <typename R, class FormatContext>
+  template <typename R, typename FormatContext>
   auto format(R&& range, FormatContext& ctx) const -> decltype(ctx.out()) {
-    detail::range_mapper<buffer_context<Char>> mapper;
+    auto mapper = detail::range_mapper<buffered_context<Char>>();
     auto out = ctx.out();
-    out = detail::copy_str<Char>(opening_bracket_, out);
-    int i = 0;
     auto it = detail::range_begin(range);
     auto end = detail::range_end(range);
+    if (is_debug) return write_debug_string(out, std::move(it), end);
+    // Regular path: brackets around separator-joined elements.
+    out = detail::copy<Char>(opening_bracket_, out);
+    int i = 0;  // elements written so far
     for (; it != end; ++it) {
-      if (i > 0) out = detail::copy_str<Char>(separator_, out);
-      ;
+      if (i > 0) out = detail::copy<Char>(separator_, out);
       ctx.advance_to(out);
-      out = underlying_.format(mapper.map(*it), ctx);
+      auto&& item = *it;  // Need an lvalue
+      out = underlying_.format(mapper.map(item), ctx);
       ++i;
     }
-    out = detail::copy_str<Char>(closing_bracket_, out);
+    out = detail::copy<Char>(closing_bracket_, out);
     return out;
   }
 };
 
-enum class range_format { disabled, map, set, sequence, string, debug_string };
+FMT_EXPORT
+template <typename T, typename Char, typename Enable = void>
+struct range_format_kind  // disabled unless is_range<T, Char> holds
+    : conditional_t<
+          is_range<T, Char>::value, detail::range_format_kind_<T>,
+          std::integral_constant<range_format, range_format::disabled>> {};
 
-namespace detail {
-template <typename T> struct range_format_kind_ {
-  static constexpr auto value = std::is_same<range_reference_type<T>, T>::value
-                                    ? range_format::disabled
-                                : is_map<T>::value ? range_format::map
-                                : is_set<T>::value ? range_format::set
-                                                   : range_format::sequence;
-};
+template <typename R, typename Char>
+struct formatter<
+    R, Char,
+    enable_if_t<conjunction<
+        bool_constant<
+            range_format_kind<R, Char>::value != range_format::disabled &&
+            range_format_kind<R, Char>::value != range_format::map &&
+            range_format_kind<R, Char>::value != range_format::string &&
+            range_format_kind<R, Char>::value != range_format::debug_string>
+// Workaround a bug in MSVC 2015 and earlier.
+#if !FMT_MSC_VERSION || FMT_MSC_VERSION >= 1910
+        ,
+        detail::is_formattable_delayed<R, Char>
+#endif
+        >::value>> {
+ private:
+  using range_type = detail::maybe_const_range<R>;
+  range_formatter<detail::uncvref_type<range_type>, Char> range_formatter_;
 
-template <range_format K, typename R, typename Char, typename Enable = void>
-struct range_default_formatter;
+ public:
+  using nonlocking = void;
+
+  FMT_CONSTEXPR formatter() {
+    if (detail::const_check(range_format_kind<R, Char>::value !=
+                            range_format::set))
+      return;
+    range_formatter_.set_brackets(detail::string_literal<Char, '{'>{},
+                                  detail::string_literal<Char, '}'>{});
+  }
 
-template <range_format K>
-using range_format_constant = std::integral_constant<range_format, K>;
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    return range_formatter_.parse(ctx);
+  }
 
-template <range_format K, typename R, typename Char>
-struct range_default_formatter<
-    K, R, Char,
-    enable_if_t<(K == range_format::sequence || K == range_format::map ||
-                 K == range_format::set)>> {
-  using range_type = detail::maybe_const_range<R>;
-  range_formatter<detail::uncvref_type<range_type>, Char> underlying_;
+  template <typename FormatContext>
+  auto format(range_type& range, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    return range_formatter_.format(range, ctx);
+  }
+};
 
-  FMT_CONSTEXPR range_default_formatter() { init(range_format_constant<K>()); }
+// A map formatter.
+template <typename R, typename Char>
+struct formatter<
+    R, Char,
+    enable_if_t<range_format_kind<R, Char>::value == range_format::map>> {
+ private:
+  using map_type = detail::maybe_const_range<R>;
+  using element_type = detail::uncvref_type<map_type>;
+
+  decltype(detail::tuple::get_formatters<element_type, Char>(
+      detail::tuple_index_sequence<element_type>())) formatters_;
+  bool no_delimiters_ = false;
+
+ public:
+  FMT_CONSTEXPR formatter() {}
 
-  FMT_CONSTEXPR void init(range_format_constant<range_format::set>) {
-    underlying_.set_brackets(detail::string_literal<Char, '{'>{},
-                             detail::string_literal<Char, '}'>{});
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    auto it = ctx.begin();
+    auto end = ctx.end();
+    if (it != end) {
+      if (detail::to_ascii(*it) == 'n') {
+        no_delimiters_ = true;  // 'n': format() omits the surrounding braces
+        ++it;
+      }
+      if (it != end && *it != '}') {
+        if (*it != ':') report_error("invalid format specifier");
+        ++it;  // skip ':'
+      }
+      ctx.advance_to(it);
+    }
+    detail::for_each(formatters_, detail::parse_empty_specs<ParseContext>{ctx});
+    return it;  // position after the optional 'n' and ':'
   }
 
-  FMT_CONSTEXPR void init(range_format_constant<range_format::map>) {
-    underlying_.set_brackets(detail::string_literal<Char, '{'>{},
-                             detail::string_literal<Char, '}'>{});
-    underlying_.underlying().set_brackets({}, {});
-    underlying_.underlying().set_separator(
-        detail::string_literal<Char, ':', ' '>{});
+  template <typename FormatContext>
+  auto format(map_type& map, FormatContext& ctx) const -> decltype(ctx.out()) {
+    // Every write goes through ctx so its iterator stays current even when
+    // the iterator is stateful; for_each2 below advances ctx on its own.
+    basic_string_view<Char> open = detail::string_literal<Char, '{'>{};
+    if (!no_delimiters_) ctx.advance_to(detail::copy<Char>(open, ctx.out()));
+    int i = 0;  // entries written so far
+    auto mapper = detail::range_mapper<buffered_context<Char>>();
+    basic_string_view<Char> sep = detail::string_literal<Char, ',', ' '>{};
+    for (auto&& value : map) {
+      if (i > 0) ctx.advance_to(detail::copy<Char>(sep, ctx.out()));
+      detail::for_each2(formatters_, mapper.map(value),
+                        detail::format_tuple_element<FormatContext>{
+                            0, ctx, detail::string_literal<Char, ':', ' '>{}});
+      ++i;
+    }
+    basic_string_view<Char> close = detail::string_literal<Char, '}'>{};
+    if (!no_delimiters_) return detail::copy<Char>(close, ctx.out());
+    return ctx.out();
   }
+};
+
+// A (debug_)string formatter.
+template <typename R, typename Char>
+struct formatter<
+    R, Char,
+    enable_if_t<range_format_kind<R, Char>::value == range_format::string ||
+                range_format_kind<R, Char>::value ==
+                    range_format::debug_string>> {
+ private:
+  using range_type = detail::maybe_const_range<R>;
+  using string_type =
+      conditional_t<std::is_constructible<
+                        detail::std_string_view<Char>,
+                        decltype(detail::range_begin(std::declval<R>())),
+                        decltype(detail::range_end(std::declval<R>()))>::value,
+                    detail::std_string_view<Char>, std::basic_string<Char>>;
 
-  FMT_CONSTEXPR void init(range_format_constant<range_format::sequence>) {}
+  formatter<string_type, Char> underlying_;
 
+ public:
   template <typename ParseContext>
   FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
     return underlying_.parse(ctx);
@@ -567,32 +639,98 @@ struct range_default_formatter<
   template <typename FormatContext>
   auto format(range_type& range, FormatContext& ctx) const
       -> decltype(ctx.out()) {
-    return underlying_.format(range, ctx);
+    // Advance ctx past the opening quote so a stateful iterator keeps it.
+    const bool quoted =
+        range_format_kind<R, Char>::value == range_format::debug_string;
+    auto out = ctx.out();
+    if (detail::const_check(quoted)) *out++ = '"';
+    ctx.advance_to(out);
+    out = underlying_.format(
+        string_type{detail::range_begin(range), detail::range_end(range)}, ctx);
+    if (detail::const_check(quoted)) *out++ = '"';
+    return out;
   }
 };
-}  // namespace detail
 
-template <typename T, typename Char, typename Enable = void>
-struct range_format_kind
-    : conditional_t<
-          is_range<T, Char>::value, detail::range_format_kind_<T>,
-          std::integral_constant<range_format, range_format::disabled>> {};
+template <typename It, typename Sentinel, typename Char = char>
+struct join_view : detail::view {
+  It begin;
+  Sentinel end;
+  basic_string_view<Char> sep;
 
-template <typename R, typename Char>
-struct formatter<
-    R, Char,
-    enable_if_t<conjunction<bool_constant<range_format_kind<R, Char>::value !=
-                                          range_format::disabled>
-// Workaround a bug in MSVC 2015 and earlier.
-#if !FMT_MSC_VERSION || FMT_MSC_VERSION >= 1910
-                            ,
-                            detail::is_formattable_delayed<R, Char>
+  join_view(It b, Sentinel e, basic_string_view<Char> s)
+      : begin(std::move(b)), end(e), sep(s) {}
+};
+
+template <typename It, typename Sentinel, typename Char>
+struct formatter<join_view<It, Sentinel, Char>, Char> {
+ private:
+  using value_type =
+#ifdef __cpp_lib_ranges
+      std::iter_value_t<It>;
+#else
+      typename std::iterator_traits<It>::value_type;
 #endif
-                            >::value>>
-    : detail::range_default_formatter<range_format_kind<R, Char>::value, R,
-                                      Char> {
+  formatter<remove_cvref_t<value_type>, Char> value_formatter_;
+
+  using view_ref = conditional_t<std::is_copy_constructible<It>::value,
+                                 const join_view<It, Sentinel, Char>&,
+                                 join_view<It, Sentinel, Char>&&>;
+
+ public:
+  using nonlocking = void;
+
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> const Char* {
+    return value_formatter_.parse(ctx);
+  }
+
+  template <typename FormatContext>
+  auto format(view_ref& value, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    auto it = std::forward<view_ref>(value).begin;  // moved if view_ref is &&
+    auto out = ctx.out();
+    if (it == value.end) return out;  // empty range: nothing is written
+    out = value_formatter_.format(*it, ctx);  // first element, no separator
+    ++it;
+    while (it != value.end) {
+      out = detail::copy<Char>(value.sep.begin(), value.sep.end(), out);
+      ctx.advance_to(out);  // element formatter must start after the separator
+      out = value_formatter_.format(*it, ctx);
+      ++it;
+    }
+    return out;
+  }
 };
 
+/// Returns a view that formats the iterator range `[begin, end)` with elements
+/// separated by `sep`.
+template <typename It, typename Sentinel>
+auto join(It begin, Sentinel end, string_view sep) -> join_view<It, Sentinel> {
+  return {std::move(begin), end, sep};
+}
+
+/**
+ * Returns a view that formats `range` with elements separated by `sep`.
+ *
+ * **Example**:
+ *
+ *     auto v = std::vector<int>{1, 2, 3};
+ *     fmt::print("{}", fmt::join(v, ", "));
+ *     // Output: 1, 2, 3
+ *
+ * `fmt::join` applies passed format specifiers to the range elements:
+ *
+ *     fmt::print("{:02}", fmt::join(v, ", "));
+ *     // Output: 01, 02, 03
+ */
+template <typename Range>
+auto join(Range&& r, string_view sep)
+    -> join_view<decltype(detail::range_begin(r)),
+                 decltype(detail::range_end(r))> {
+  return {detail::range_begin(r), detail::range_end(r), sep};
+}
+
 template <typename Char, typename... T> struct tuple_join_view : detail::view {
   const std::tuple<T...>& tuple;
   basic_string_view<Char> sep;
@@ -601,9 +739,6 @@ template <typename Char, typename... T> struct tuple_join_view : detail::view {
       : tuple(t), sep{s} {}
 };
 
-template <typename Char, typename... T>
-using tuple_arg_join = tuple_join_view<Char, T...>;
-
 // Define FMT_TUPLE_JOIN_SPECIFIERS to enable experimental format specifiers
 // support in tuple_join. It is disabled by default because of issues with
 // the dynamic width and precision.
@@ -645,7 +780,7 @@ struct formatter<tuple_join_view<Char, T...>, Char> {
     if (N > 1) {
       auto end1 = do_parse(ctx, std::integral_constant<size_t, N - 1>());
       if (end != end1)
-        FMT_THROW(format_error("incompatible format specs for tuple elements"));
+        report_error("incompatible format specs for tuple elements");
     }
 #endif
     return end;
@@ -664,27 +799,61 @@ struct formatter<tuple_join_view<Char, T...>, Char> {
       typename FormatContext::iterator {
     auto out = std::get<sizeof...(T) - N>(formatters_)
                    .format(std::get<sizeof...(T) - N>(value.tuple), ctx);
-    if (N > 1) {
-      out = std::copy(value.sep.begin(), value.sep.end(), out);
-      ctx.advance_to(out);
-      return do_format(value, ctx, std::integral_constant<size_t, N - 1>());
-    }
-    return out;
+    if (N <= 1) return out;
+    out = detail::copy<Char>(value.sep, out);
+    ctx.advance_to(out);
+    return do_format(value, ctx, std::integral_constant<size_t, N - 1>());
   }
 };
 
-FMT_MODULE_EXPORT_BEGIN
+namespace detail {
+// Check if T has an interface like a container adaptor (e.g. std::stack,
+// std::queue, std::priority_queue).
+template <typename T> class is_container_adaptor_like {
+  template <typename U> static auto check(U* p) -> typename U::container_type;
+  template <typename> static void check(...);
 
-/**
-  \rst
-  Returns an object that formats `tuple` with elements separated by `sep`.
+ public:
+  static constexpr const bool value =
+      !std::is_void<decltype(check<T>(nullptr))>::value;
+};
 
-  **Example**::
+template <typename Container> struct all {
+  const Container& c;
+  auto begin() const -> typename Container::const_iterator { return c.begin(); }
+  auto end() const -> typename Container::const_iterator { return c.end(); }
+};
+}  // namespace detail
 
-    std::tuple<int, char> t = {1, 'a'};
-    fmt::print("{}", fmt::join(t, ", "));
-    // Output: "1, a"
-  \endrst
+template <typename T, typename Char>
+struct formatter<
+    T, Char,
+    enable_if_t<conjunction<detail::is_container_adaptor_like<T>,
+                            bool_constant<range_format_kind<T, Char>::value ==
+                                          range_format::disabled>>::value>>
+    : formatter<detail::all<typename T::container_type>, Char> {
+  using all = detail::all<typename T::container_type>;
+  template <typename FormatContext>
+  auto format(const T& t, FormatContext& ctx) const -> decltype(ctx.out()) {
+    struct getter : T {
+      static auto get(const T& t) -> all {
+        return {t.*(&getter::c)};  // Access c through the derived class.
+      }
+    };
+    return formatter<all>::format(getter::get(t), ctx);
+  }
+};
+
+FMT_BEGIN_EXPORT
+
+/**
+ * Returns an object that formats `std::tuple` with elements separated by `sep`.
+ *
+ * **Example**:
+ *
+ *     auto t = std::tuple<int, char>{1, 'a'};
+ *     fmt::print("{}", fmt::join(t, ", "));
+ *     // Output: 1, a
  */
 template <typename... T>
 FMT_CONSTEXPR auto join(const std::tuple<T...>& tuple, string_view sep)
@@ -692,23 +861,14 @@ FMT_CONSTEXPR auto join(const std::tuple<T...>& tuple, string_view sep)
   return {tuple, sep};
 }
 
-template <typename... T>
-FMT_CONSTEXPR auto join(const std::tuple<T...>& tuple,
-                        basic_string_view<wchar_t> sep)
-    -> tuple_join_view<wchar_t, T...> {
-  return {tuple, sep};
-}
-
 /**
-  \rst
-  Returns an object that formats `initializer_list` with elements separated by
-  `sep`.
-
-  **Example**::
-
-    fmt::print("{}", fmt::join({1, 2, 3}, ", "));
-    // Output: "1, 2, 3"
-  \endrst
+ * Returns an object that formats `std::initializer_list` with elements
+ * separated by `sep`.
+ *
+ * **Example**:
+ *
+ *     fmt::print("{}", fmt::join({1, 2, 3}, ", "));
+ *     // Output: 1, 2, 3
  */
 template <typename T>
 auto join(std::initializer_list<T> list, string_view sep)
@@ -716,7 +876,7 @@ auto join(std::initializer_list<T> list, string_view sep)
   return join(std::begin(list), std::end(list), sep);
 }
 
-FMT_MODULE_EXPORT_END
+FMT_END_EXPORT
 FMT_END_NAMESPACE
 
 #endif  // FMT_RANGES_H_
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/std.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/std.h
index ec7abaa99f05..fb43940bc06f 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/std.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/std.h
@@ -8,106 +8,328 @@
 #ifndef FMT_STD_H_
 #define FMT_STD_H_
 
-#include <cstdlib>
-#include <exception>
-#include <memory>
-#include <thread>
-#include <type_traits>
-#include <typeinfo>
-#include <utility>
-
+#include "format.h"
 #include "ostream.h"
 
-#if FMT_HAS_INCLUDE(<version>)
-#  include <version>
-#endif
-// Checking FMT_CPLUSPLUS for warning suppression in MSVC.
-#if FMT_CPLUSPLUS >= 201703L
-#  if FMT_HAS_INCLUDE(<filesystem>)
-#    include <filesystem>
+#ifndef FMT_MODULE
+#  include <atomic>
+#  include <bitset>
+#  include <complex>
+#  include <cstdlib>
+#  include <exception>
+#  include <memory>
+#  include <thread>
+#  include <type_traits>
+#  include <typeinfo>
+#  include <utility>
+#  include <vector>
+
+// Check FMT_CPLUSPLUS to suppress a bogus warning in MSVC.
+#  if FMT_CPLUSPLUS >= 201703L
+#    if FMT_HAS_INCLUDE(<filesystem>)
+#      include <filesystem>
+#    endif
+#    if FMT_HAS_INCLUDE(<variant>)
+#      include <variant>
+#    endif
+#    if FMT_HAS_INCLUDE(<optional>)
+#      include <optional>
+#    endif
 #  endif
-#  if FMT_HAS_INCLUDE(<variant>)
-#    include <variant>
+// Use > instead of >= in the version check because <source_location> may be
+// available after C++17 but before C++20 is marked as implemented.
+#  if FMT_CPLUSPLUS > 201703L && FMT_HAS_INCLUDE(<source_location>)
+#    include <source_location>
 #  endif
+#  if FMT_CPLUSPLUS > 202002L && FMT_HAS_INCLUDE(<expected>)
+#    include <expected>
+#  endif
+#endif  // FMT_MODULE
+
+#if FMT_HAS_INCLUDE(<version>)
+#  include <version>
 #endif
 
 // GCC 4 does not support FMT_HAS_INCLUDE.
 #if FMT_HAS_INCLUDE(<cxxabi.h>) || defined(__GLIBCXX__)
 #  include <cxxabi.h>
-// Android NDK with gabi++ library on some archtectures does not implement
+// Android NDK with gabi++ library on some architectures does not implement
 // abi::__cxa_demangle().
 #  ifndef __GABIXX_CXXABI_H__
 #    define FMT_HAS_ABI_CXA_DEMANGLE
 #  endif
 #endif
 
-#ifdef __cpp_lib_filesystem
+// For older Xcode versions, __cpp_lib_xxx flags are inaccurately defined.
+#ifndef FMT_CPP_LIB_FILESYSTEM
+#  ifdef __cpp_lib_filesystem
+#    define FMT_CPP_LIB_FILESYSTEM __cpp_lib_filesystem
+#  else
+#    define FMT_CPP_LIB_FILESYSTEM 0
+#  endif
+#endif
+
+#ifndef FMT_CPP_LIB_VARIANT
+#  ifdef __cpp_lib_variant
+#    define FMT_CPP_LIB_VARIANT __cpp_lib_variant
+#  else
+#    define FMT_CPP_LIB_VARIANT 0
+#  endif
+#endif
+
+#if FMT_CPP_LIB_FILESYSTEM
 FMT_BEGIN_NAMESPACE
 
 namespace detail {
 
-template <typename Char>
-void write_escaped_path(basic_memory_buffer<Char>& quoted,
-                        const std::filesystem::path& p) {
-  write_escaped_string<Char>(std::back_inserter(quoted), p.string<Char>());
-}
-#  ifdef _WIN32
-template <>
-inline void write_escaped_path<char>(basic_memory_buffer<char>& quoted,
-                                     const std::filesystem::path& p) {
-  auto s = p.u8string();
-  write_escaped_string<char>(
-      std::back_inserter(quoted),
-      string_view(reinterpret_cast<const char*>(s.c_str()), s.size()));
+template <typename Char, typename PathChar>
+auto get_path_string(const std::filesystem::path& p,
+                     const std::basic_string<PathChar>& native) {
+  if constexpr (std::is_same_v<Char, char> && std::is_same_v<PathChar, wchar_t>)
+    return to_utf8<wchar_t>(native, to_utf8_error_policy::replace);
+  else
+    return p.string<Char>();
 }
-#  endif
-template <>
-inline void write_escaped_path<std::filesystem::path::value_type>(
-    basic_memory_buffer<std::filesystem::path::value_type>& quoted,
-    const std::filesystem::path& p) {
-  write_escaped_string<std::filesystem::path::value_type>(
-      std::back_inserter(quoted), p.native());
+
+template <typename Char, typename PathChar>
+void write_escaped_path(basic_memory_buffer<Char>& quoted,
+                        const std::filesystem::path& p,
+                        const std::basic_string<PathChar>& native) {
+  if constexpr (std::is_same_v<Char, char> &&
+                std::is_same_v<PathChar, wchar_t>) {
+    auto buf = basic_memory_buffer<wchar_t>();
+    write_escaped_string<wchar_t>(std::back_inserter(buf), native);
+    bool valid = to_utf8<wchar_t>::convert(quoted, {buf.data(), buf.size()});
+    FMT_ASSERT(valid, "invalid utf16");
+  } else if constexpr (std::is_same_v<Char, PathChar>) {
+    write_escaped_string<std::filesystem::path::value_type>(
+        std::back_inserter(quoted), native);
+  } else {
+    write_escaped_string<Char>(std::back_inserter(quoted), p.string<Char>());
+  }
 }
 
 }  // namespace detail
 
-template <typename Char>
-struct formatter<std::filesystem::path, Char>
-    : formatter<basic_string_view<Char>> {
+FMT_EXPORT
+template <typename Char> struct formatter<std::filesystem::path, Char> {
+ private:
+  format_specs specs_;
+  detail::arg_ref<Char> width_ref_;
+  bool debug_ = false;
+  char path_type_ = 0;
+
+ public:
+  FMT_CONSTEXPR void set_debug_format(bool set = true) { debug_ = set; }
+
+  template <typename ParseContext> FMT_CONSTEXPR auto parse(ParseContext& ctx) {
+    auto it = ctx.begin(), end = ctx.end();
+    if (it == end) return it;
+
+    it = detail::parse_align(it, end, specs_);
+    if (it == end) return it;
+
+    it = detail::parse_dynamic_spec(it, end, specs_.width, width_ref_, ctx);
+    if (it != end && *it == '?') {
+      debug_ = true;
+      ++it;
+    }
+    if (it != end && (*it == 'g')) path_type_ = detail::to_ascii(*it++);
+    return it;
+  }
+
   template <typename FormatContext>
-  auto format(const std::filesystem::path& p, FormatContext& ctx) const ->
-      typename FormatContext::iterator {
-    basic_memory_buffer<Char> quoted;
-    detail::write_escaped_path(quoted, p);
-    return formatter<basic_string_view<Char>>::format(
-        basic_string_view<Char>(quoted.data(), quoted.size()), ctx);
+  auto format(const std::filesystem::path& p, FormatContext& ctx) const {
+    auto specs = specs_;
+    auto path_string =
+        !path_type_ ? p.native()
+                    : p.generic_string<std::filesystem::path::value_type>();
+
+    detail::handle_dynamic_spec<detail::width_checker>(specs.width, width_ref_,
+                                                       ctx);
+    if (!debug_) {
+      auto s = detail::get_path_string<Char>(p, path_string);
+      return detail::write(ctx.out(), basic_string_view<Char>(s), specs);
+    }
+    auto quoted = basic_memory_buffer<Char>();
+    detail::write_escaped_path(quoted, p, path_string);
+    return detail::write(ctx.out(),
+                         basic_string_view<Char>(quoted.data(), quoted.size()),
+                         specs);
+  }
+};
+
+class path : public std::filesystem::path {
+ public:
+  auto display_string() const -> std::string {
+    const std::filesystem::path& base = *this;
+    return fmt::format(FMT_STRING("{}"), base);
+  }
+  auto system_string() const -> std::string { return string(); }
+
+  auto generic_display_string() const -> std::string {
+    const std::filesystem::path& base = *this;
+    return fmt::format(FMT_STRING("{:g}"), base);
   }
+  auto generic_system_string() const -> std::string { return generic_string(); }
 };
+
 FMT_END_NAMESPACE
-#endif
+#endif  // FMT_CPP_LIB_FILESYSTEM
 
 FMT_BEGIN_NAMESPACE
+FMT_EXPORT
+template <std::size_t N, typename Char>
+struct formatter<std::bitset<N>, Char> : nested_formatter<string_view> {
+ private:
+  // Functor because C++11 doesn't support generic lambdas.
+  struct writer {
+    const std::bitset<N>& bs;
+
+    template <typename OutputIt>
+    FMT_CONSTEXPR auto operator()(OutputIt out) -> OutputIt {
+      for (auto pos = N; pos > 0; --pos) {
+        out = detail::write<Char>(out, bs[pos - 1] ? Char('1') : Char('0'));
+      }
+
+      return out;
+    }
+  };
+
+ public:
+  template <typename FormatContext>
+  auto format(const std::bitset<N>& bs, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    return write_padded(ctx, writer{bs});
+  }
+};
+
+FMT_EXPORT
 template <typename Char>
 struct formatter<std::thread::id, Char> : basic_ostream_formatter<Char> {};
 FMT_END_NAMESPACE
 
-#ifdef __cpp_lib_variant
+#ifdef __cpp_lib_optional
 FMT_BEGIN_NAMESPACE
-template <typename Char> struct formatter<std::monostate, Char> {
+FMT_EXPORT
+template <typename T, typename Char>
+struct formatter<std::optional<T>, Char,
+                 std::enable_if_t<is_formattable<T, Char>::value>> {
+ private:
+  formatter<T, Char> underlying_;
+  static constexpr basic_string_view<Char> optional =
+      detail::string_literal<Char, 'o', 'p', 't', 'i', 'o', 'n', 'a', 'l',
+                             '('>{};
+  static constexpr basic_string_view<Char> none =
+      detail::string_literal<Char, 'n', 'o', 'n', 'e'>{};
+
+  template <class U>
+  FMT_CONSTEXPR static auto maybe_set_debug_format(U& u, bool set)
+      -> decltype(u.set_debug_format(set)) {
+    u.set_debug_format(set);
+  }
+
+  template <class U>
+  FMT_CONSTEXPR static void maybe_set_debug_format(U&, ...) {}
+
+ public:
+  template <typename ParseContext> FMT_CONSTEXPR auto parse(ParseContext& ctx) {
+    maybe_set_debug_format(underlying_, true);
+    return underlying_.parse(ctx);
+  }
+
+  template <typename FormatContext>
+  auto format(const std::optional<T>& opt, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    if (!opt) return detail::write<Char>(ctx.out(), none);
+
+    auto out = ctx.out();
+    out = detail::write<Char>(out, optional);
+    ctx.advance_to(out);
+    out = underlying_.format(*opt, ctx);
+    return detail::write(out, ')');
+  }
+};
+FMT_END_NAMESPACE
+#endif  // __cpp_lib_optional
+
+#if defined(__cpp_lib_expected) || FMT_CPP_LIB_VARIANT
+
+FMT_BEGIN_NAMESPACE
+namespace detail {
+
+template <typename Char, typename OutputIt, typename T>
+auto write_escaped_alternative(OutputIt out, const T& v) -> OutputIt {
+  if constexpr (has_to_string_view<T>::value)
+    return write_escaped_string<Char>(out, detail::to_string_view(v));
+  if constexpr (std::is_same_v<T, Char>) return write_escaped_char(out, v);
+  return write<Char>(out, v);
+}
+
+}  // namespace detail
+
+FMT_END_NAMESPACE
+#endif
+
+#ifdef __cpp_lib_expected
+FMT_BEGIN_NAMESPACE
+
+FMT_EXPORT
+template <typename T, typename E, typename Char>
+struct formatter<std::expected<T, E>, Char,
+                 std::enable_if_t<is_formattable<T, Char>::value &&
+                                  is_formattable<E, Char>::value>> {
   template <typename ParseContext>
   FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
     return ctx.begin();
   }
 
   template <typename FormatContext>
-  auto format(const std::monostate&, FormatContext& ctx) const
+  auto format(const std::expected<T, E>& value, FormatContext& ctx) const
       -> decltype(ctx.out()) {
     auto out = ctx.out();
-    out = detail::write<Char>(out, "monostate");
+
+    if (value.has_value()) {
+      out = detail::write<Char>(out, "expected(");
+      out = detail::write_escaped_alternative<Char>(out, *value);
+    } else {
+      out = detail::write<Char>(out, "unexpected(");
+      out = detail::write_escaped_alternative<Char>(out, value.error());
+    }
+    *out++ = ')';
     return out;
   }
 };
+FMT_END_NAMESPACE
+#endif  // __cpp_lib_expected
 
+#ifdef __cpp_lib_source_location
+FMT_BEGIN_NAMESPACE
+FMT_EXPORT
+template <> struct formatter<std::source_location> {
+  template <typename ParseContext> FMT_CONSTEXPR auto parse(ParseContext& ctx) {
+    return ctx.begin();
+  }
+
+  template <typename FormatContext>
+  auto format(const std::source_location& loc, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    auto out = ctx.out();
+    out = detail::write(out, loc.file_name());
+    out = detail::write(out, ':');
+    out = detail::write<char>(out, loc.line());
+    out = detail::write(out, ':');
+    out = detail::write<char>(out, loc.column());
+    out = detail::write(out, ": ");
+    out = detail::write(out, loc.function_name());
+    return out;
+  }
+};
+FMT_END_NAMESPACE
+#endif
+
+#if FMT_CPP_LIB_VARIANT
+FMT_BEGIN_NAMESPACE
 namespace detail {
 
 template <typename T>
@@ -130,16 +352,6 @@ template <typename T, typename C> class is_variant_formattable_ {
       decltype(check(variant_index_sequence<T>{}))::value;
 };
 
-template <typename Char, typename OutputIt, typename T>
-auto write_variant_alternative(OutputIt out, const T& v) -> OutputIt {
-  if constexpr (is_string<T>::value)
-    return write_escaped_string<Char>(out, detail::to_string_view(v));
-  else if constexpr (std::is_same_v<T, Char>)
-    return write_escaped_char(out, v);
-  else
-    return write<Char>(out, v);
-}
-
 }  // namespace detail
 
 template <typename T> struct is_variant_like {
@@ -151,6 +363,21 @@ template <typename T, typename C> struct is_variant_formattable {
       detail::is_variant_formattable_<T, C>::value;
 };
 
+FMT_EXPORT
+template <typename Char> struct formatter<std::monostate, Char> {
+  template <typename ParseContext>
+  FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
+    return ctx.begin();
+  }
+
+  template <typename FormatContext>
+  auto format(const std::monostate&, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    return detail::write<Char>(ctx.out(), "monostate");
+  }
+};
+
+FMT_EXPORT
 template <typename Variant, typename Char>
 struct formatter<
     Variant, Char,
@@ -167,20 +394,25 @@ struct formatter<
     auto out = ctx.out();
 
     out = detail::write<Char>(out, "variant(");
-    std::visit(
-        [&](const auto& v) {
-          out = detail::write_variant_alternative<Char>(out, v);
-        },
-        value);
+    FMT_TRY {
+      std::visit(
+          [&](const auto& v) {
+            out = detail::write_escaped_alternative<Char>(out, v);
+          },
+          value);
+    }
+    FMT_CATCH(const std::bad_variant_access&) {  // valueless variant
+      out = detail::write<Char>(out, "valueless by exception");
+    }
     *out++ = ')';
     return out;
   }
 };
 FMT_END_NAMESPACE
-#endif  // __cpp_lib_variant
+#endif  // FMT_CPP_LIB_VARIANT
 
 FMT_BEGIN_NAMESPACE
-
+FMT_EXPORT
 template <typename Char> struct formatter<std::error_code, Char> {
   template <typename ParseContext>
   FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
@@ -191,17 +423,106 @@ template <typename Char> struct formatter<std::error_code, Char> {
   FMT_CONSTEXPR auto format(const std::error_code& ec, FormatContext& ctx) const
       -> decltype(ctx.out()) {
     auto out = ctx.out();
-    out = detail::write_bytes(out, ec.category().name(),
-                              basic_format_specs<Char>());
+    out = detail::write_bytes<Char>(out, ec.category().name(), format_specs());
     out = detail::write<Char>(out, Char(':'));
     out = detail::write<Char>(out, ec.value());
     return out;
   }
 };
 
+#if FMT_USE_RTTI
+namespace detail {
+
+template <typename Char, typename OutputIt>
+auto write_demangled_name(OutputIt out, const std::type_info& ti) -> OutputIt {
+#  ifdef FMT_HAS_ABI_CXA_DEMANGLE
+  int status = 0;
+  std::size_t size = 0;
+  std::unique_ptr<char, void (*)(void*)> demangled_name_ptr(
+      abi::__cxa_demangle(ti.name(), nullptr, &size, &status), &std::free);
+
+  string_view demangled_name_view;
+  if (demangled_name_ptr) {
+    demangled_name_view = demangled_name_ptr.get();
+
+    // Normalization of stdlib inline namespace names.
+    // libc++ inline namespaces.
+    //  std::__1::*       -> std::*
+    //  std::__1::__fs::* -> std::*
+    // libstdc++ inline namespaces.
+    //  std::__cxx11::*             -> std::*
+    //  std::filesystem::__cxx11::* -> std::filesystem::*
+    if (demangled_name_view.starts_with("std::")) {
+      char* begin = demangled_name_ptr.get();
+      char* to = begin + 5;  // std::
+      for (char *from = to, *end = begin + demangled_name_view.size();
+           from < end;) {
+        // This is safe, because demangled_name is NUL-terminated.
+        if (from[0] == '_' && from[1] == '_') {
+          char* next = from + 1;
+          while (next < end && *next != ':') next++;
+          if (next[0] == ':' && next[1] == ':') {
+            from = next + 2;
+            continue;
+          }
+        }
+        *to++ = *from++;
+      }
+      demangled_name_view = {begin, detail::to_unsigned(to - begin)};
+    }
+  } else {
+    demangled_name_view = string_view(ti.name());
+  }
+  return detail::write_bytes<Char>(out, demangled_name_view);
+#  elif FMT_MSC_VERSION
+  const string_view demangled_name(ti.name());
+  for (std::size_t i = 0; i < demangled_name.size(); ++i) {
+    auto sub = demangled_name;
+    sub.remove_prefix(i);
+    if (sub.starts_with("enum ")) {
+      i += 4;
+      continue;
+    }
+    if (sub.starts_with("class ") || sub.starts_with("union ")) {
+      i += 5;
+      continue;
+    }
+    if (sub.starts_with("struct ")) {
+      i += 6;
+      continue;
+    }
+    if (*sub.begin() != ' ') *out++ = *sub.begin();
+  }
+  return out;
+#  else
+  return detail::write_bytes<Char>(out, string_view(ti.name()));
+#  endif
+}
+
+}  // namespace detail
+
+FMT_EXPORT
+template <typename Char>
+struct formatter<std::type_info, Char  // DEPRECATED! Mixing code unit types.
+                 > {
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    return ctx.begin();
+  }
+
+  template <typename Context>
+  auto format(const std::type_info& ti, Context& ctx) const
+      -> decltype(ctx.out()) {
+    return detail::write_demangled_name<Char>(ctx.out(), ti);
+  }
+};
+#endif
+
+FMT_EXPORT
 template <typename T, typename Char>
 struct formatter<
-    T, Char,
+    T, Char,  // DEPRECATED! Mixing code unit types.
     typename std::enable_if<std::is_base_of<std::exception, T>::value>::type> {
  private:
   bool with_typename_ = false;
@@ -214,76 +535,165 @@ struct formatter<
     if (it == end || *it == '}') return it;
     if (*it == 't') {
       ++it;
-      with_typename_ = true;
+      with_typename_ = FMT_USE_RTTI != 0;
     }
     return it;
   }
 
-  template <typename OutputIt>
-  auto format(const std::exception& ex,
-              basic_format_context<OutputIt, Char>& ctx) const -> OutputIt {
-    basic_format_specs<Char> spec;
+  template <typename Context>
+  auto format(const std::exception& ex, Context& ctx) const
+      -> decltype(ctx.out()) {
     auto out = ctx.out();
-    if (!with_typename_)
-      return detail::write_bytes(out, string_view(ex.what()), spec);
-
-    const std::type_info& ti = typeid(ex);
-#ifdef FMT_HAS_ABI_CXA_DEMANGLE
-    int status = 0;
-    std::size_t size = 0;
-    std::unique_ptr<char, decltype(&std::free)> demangled_name_ptr(
-        abi::__cxa_demangle(ti.name(), nullptr, &size, &status), &std::free);
-
-    string_view demangled_name_view;
-    if (demangled_name_ptr) {
-      demangled_name_view = demangled_name_ptr.get();
-
-      // Normalization of stdlib inline namespace names.
-      // libc++ inline namespaces.
-      //  std::__1::*       -> std::*
-      //  std::__1::__fs::* -> std::*
-      // libstdc++ inline namespaces.
-      //  std::__cxx11::*             -> std::*
-      //  std::filesystem::__cxx11::* -> std::filesystem::*
-      if (demangled_name_view.starts_with("std::")) {
-        char* begin = demangled_name_ptr.get();
-        char* to = begin + 5;  // std::
-        for (char *from = to, *end = begin + demangled_name_view.size();
-             from < end;) {
-          // This is safe, because demangled_name is NUL-terminated.
-          if (from[0] == '_' && from[1] == '_') {
-            char* next = from + 1;
-            while (next < end && *next != ':') next++;
-            if (next[0] == ':' && next[1] == ':') {
-              from = next + 2;
-              continue;
-            }
-          }
-          *to++ = *from++;
-        }
-        demangled_name_view = {begin, detail::to_unsigned(to - begin)};
-      }
-    } else {
-      demangled_name_view = string_view(ti.name());
+#if FMT_USE_RTTI
+    if (with_typename_) {
+      out = detail::write_demangled_name<Char>(out, typeid(ex));
+      *out++ = ':';
+      *out++ = ' ';
     }
-    out = detail::write_bytes(out, demangled_name_view, spec);
-#elif FMT_MSC_VERSION
-    string_view demangled_name_view(ti.name());
-    if (demangled_name_view.starts_with("class "))
-      demangled_name_view.remove_prefix(6);
-    else if (demangled_name_view.starts_with("struct "))
-      demangled_name_view.remove_prefix(7);
-    out = detail::write_bytes(out, demangled_name_view, spec);
-#else
-    out = detail::write_bytes(out, string_view(ti.name()), spec);
 #endif
-    out = detail::write<Char>(out, Char(':'));
-    out = detail::write<Char>(out, Char(' '));
-    out = detail::write_bytes(out, string_view(ex.what()), spec);
+    return detail::write_bytes<Char>(out, string_view(ex.what()));
+  }
+};
 
+namespace detail {
+
+template <typename T, typename Enable = void>
+struct has_flip : std::false_type {};
+
+template <typename T>
+struct has_flip<T, void_t<decltype(std::declval<T>().flip())>>
+    : std::true_type {};
+
+template <typename T> struct is_bit_reference_like {
+  static constexpr const bool value =
+      std::is_convertible<T, bool>::value &&
+      std::is_nothrow_assignable<T, bool>::value && has_flip<T>::value;
+};
+
+#ifdef _LIBCPP_VERSION
+
+// Workaround for libc++ incompatibility with C++ standard.
+// According to the Standard, `bitset::operator[] const` returns bool.
+template <typename C>
+struct is_bit_reference_like<std::__bit_const_reference<C>> {
+  static constexpr const bool value = true;
+};
+
+#endif
+
+}  // namespace detail
+
+// We can't use std::vector<bool, Allocator>::reference and
+// std::bitset<N>::reference because the compiler can't deduce Allocator and N
+// in partial specialization.
+FMT_EXPORT
+template <typename BitRef, typename Char>
+struct formatter<BitRef, Char,
+                 enable_if_t<detail::is_bit_reference_like<BitRef>::value>>
+    : formatter<bool, Char> {
+  template <typename FormatContext>
+  FMT_CONSTEXPR auto format(const BitRef& v, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    return formatter<bool, Char>::format(v, ctx);
+  }
+};
+
+template <typename T, typename Deleter>
+auto ptr(const std::unique_ptr<T, Deleter>& p) -> const void* {
+  return p.get();
+}
+template <typename T> auto ptr(const std::shared_ptr<T>& p) -> const void* {
+  return p.get();
+}
+
+FMT_EXPORT
+template <typename T, typename Char>
+struct formatter<std::atomic<T>, Char,
+                 enable_if_t<is_formattable<T, Char>::value>>
+    : formatter<T, Char> {
+  template <typename FormatContext>
+  auto format(const std::atomic<T>& v, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    return formatter<T, Char>::format(v.load(), ctx);
+  }
+};
+
+#ifdef __cpp_lib_atomic_flag_test
+FMT_EXPORT
+template <typename Char>
+struct formatter<std::atomic_flag, Char> : formatter<bool, Char> {
+  template <typename FormatContext>
+  auto format(const std::atomic_flag& v, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    return formatter<bool, Char>::format(v.test(), ctx);
+  }
+};
+#endif  // __cpp_lib_atomic_flag_test
+
+FMT_EXPORT
+template <typename T, typename Char> struct formatter<std::complex<T>, Char> {
+ private:
+  detail::dynamic_format_specs<Char> specs_;
+
+  template <typename FormatContext, typename OutputIt>
+  FMT_CONSTEXPR auto do_format(const std::complex<T>& c,
+                               detail::dynamic_format_specs<Char>& specs,
+                               FormatContext& ctx, OutputIt out) const
+      -> OutputIt {
+    if (c.real() != 0) {
+      *out++ = Char('(');
+      out = detail::write<Char>(out, c.real(), specs, ctx.locale());
+      specs.sign = sign::plus;
+      out = detail::write<Char>(out, c.imag(), specs, ctx.locale());
+      if (!detail::isfinite(c.imag())) *out++ = Char(' ');
+      *out++ = Char('i');
+      *out++ = Char(')');
+      return out;
+    }
+    out = detail::write<Char>(out, c.imag(), specs, ctx.locale());
+    if (!detail::isfinite(c.imag())) *out++ = Char(' ');
+    *out++ = Char('i');
     return out;
   }
+
+ public:
+  FMT_CONSTEXPR auto parse(basic_format_parse_context<Char>& ctx)
+      -> decltype(ctx.begin()) {
+    if (ctx.begin() == ctx.end() || *ctx.begin() == '}') return ctx.begin();
+    return parse_format_specs(ctx.begin(), ctx.end(), specs_, ctx,
+                              detail::type_constant<T, Char>::value);
+  }
+
+  template <typename FormatContext>
+  auto format(const std::complex<T>& c, FormatContext& ctx) const
+      -> decltype(ctx.out()) {
+    auto specs = specs_;
+    if (specs.width_ref.kind != detail::arg_id_kind::none ||
+        specs.precision_ref.kind != detail::arg_id_kind::none) {
+      detail::handle_dynamic_spec<detail::width_checker>(specs.width,
+                                                         specs.width_ref, ctx);
+      detail::handle_dynamic_spec<detail::precision_checker>(
+          specs.precision, specs.precision_ref, ctx);
+    }
+
+    if (specs.width == 0) return do_format(c, specs, ctx, ctx.out());
+    auto buf = basic_memory_buffer<Char>();
+
+    auto outer_specs = format_specs();
+    outer_specs.width = specs.width;
+    outer_specs.fill = specs.fill;
+    outer_specs.align = specs.align;
+
+    specs.width = 0;
+    specs.fill = {};
+    specs.align = align::none;
+
+    do_format(c, specs, ctx, basic_appender<Char>(buf));
+    return detail::write<Char>(ctx.out(),
+                               basic_string_view<Char>(buf.data(), buf.size()),
+                               outer_specs);
+  }
 };
-FMT_END_NAMESPACE
 
+FMT_END_NAMESPACE
 #endif  // FMT_STD_H_
diff --git a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/xchar.h b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/xchar.h
index 40e699b64253..b1f39ed22202 100644
--- a/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/xchar.h
+++ b/packages/seacas/libraries/ioss/src/private_copy_fmt/fmt/xchar.h
@@ -8,12 +8,15 @@
 #ifndef FMT_XCHAR_H_
 #define FMT_XCHAR_H_
 
-#include <cwchar>
-
+#include "color.h"
 #include "format.h"
+#include "ranges.h"
 
-#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
-#  include <locale>
+#ifndef FMT_MODULE
+#  include <cwchar>
+#  if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
+#    include <locale>
+#  endif
 #endif
 
 FMT_BEGIN_NAMESPACE
@@ -22,9 +25,24 @@ namespace detail {
 template <typename T>
 using is_exotic_char = bool_constant<!std::is_same<T, char>::value>;
 
-inline auto write_loc(std::back_insert_iterator<detail::buffer<wchar_t>> out,
-                      loc_value value, const basic_format_specs<wchar_t>& specs,
-                      locale_ref loc) -> bool {
+template <typename S, typename = void> struct format_string_char {};
+
+template <typename S>
+struct format_string_char<
+    S, void_t<decltype(sizeof(detail::to_string_view(std::declval<S>())))>> {
+  using type = char_t<S>;
+};
+
+template <typename S>
+struct format_string_char<S, enable_if_t<is_compile_string<S>::value>> {
+  using type = typename S::char_type;
+};
+
+template <typename S>
+using format_string_char_t = typename format_string_char<S>::type;
+
+inline auto write_loc(basic_appender<wchar_t> out, loc_value value,
+                      const format_specs& specs, locale_ref loc) -> bool {
 #ifndef FMT_STATIC_THOUSANDS_SEPARATOR
   auto& numpunct =
       std::use_facet<std::numpunct<wchar_t>>(loc.get<std::locale>());
@@ -37,11 +55,11 @@ inline auto write_loc(std::back_insert_iterator<detail::buffer<wchar_t>> out,
 }
 }  // namespace detail
 
-FMT_MODULE_EXPORT_BEGIN
+FMT_BEGIN_EXPORT
 
 using wstring_view = basic_string_view<wchar_t>;
 using wformat_parse_context = basic_format_parse_context<wchar_t>;
-using wformat_context = buffer_context<wchar_t>;
+using wformat_context = buffered_context<wchar_t>;
 using wformat_args = basic_format_args<wformat_context>;
 using wmemory_buffer = basic_memory_buffer<wchar_t>;
 
@@ -52,23 +70,30 @@ inline auto runtime(wstring_view s) -> wstring_view { return s; }
 #else
 template <typename... Args>
 using wformat_string = basic_format_string<wchar_t, type_identity_t<Args>...>;
-inline auto runtime(wstring_view s) -> basic_runtime<wchar_t> { return {{s}}; }
+inline auto runtime(wstring_view s) -> runtime_format_string<wchar_t> {
+  return {{s}};
+}
 #endif
 
 template <> struct is_char<wchar_t> : std::true_type {};
-template <> struct is_char<detail::char8_type> : std::true_type {};
 template <> struct is_char<char16_t> : std::true_type {};
 template <> struct is_char<char32_t> : std::true_type {};
 
-template <typename... Args>
-constexpr format_arg_store<wformat_context, Args...> make_wformat_args(
-    const Args&... args) {
-  return {args...};
+#ifdef __cpp_char8_t
+template <>
+struct is_char<char8_t> : bool_constant<detail::is_utf8_enabled()> {};
+#endif
+
+template <typename... T>
+constexpr auto make_wformat_args(T&... args)
+    -> decltype(fmt::make_format_args<wformat_context>(args...)) {
+  return fmt::make_format_args<wformat_context>(args...);
 }
 
 inline namespace literals {
 #if FMT_USE_USER_DEFINED_LITERALS && !FMT_USE_NONTYPE_TEMPLATE_ARGS
-constexpr detail::udl_arg<wchar_t> operator"" _a(const wchar_t* s, size_t) {
+constexpr auto operator""_a(const wchar_t* s, size_t)
+    -> detail::udl_arg<wchar_t> {
   return {s};
 }
 #endif
@@ -93,13 +118,19 @@ auto join(std::initializer_list<T> list, wstring_view sep)
   return join(std::begin(list), std::end(list), sep);
 }
 
+template <typename... T>
+auto join(const std::tuple<T...>& tuple, basic_string_view<wchar_t> sep)
+    -> tuple_join_view<wchar_t, T...> {
+  return {tuple, sep};
+}
+
 template <typename Char, FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
 auto vformat(basic_string_view<Char> format_str,
-             basic_format_args<buffer_context<type_identity_t<Char>>> args)
+             typename detail::vformat_args<Char>::type args)
     -> std::basic_string<Char> {
-  basic_memory_buffer<Char> buffer;
-  detail::vformat_to(buffer, format_str, args);
-  return to_string(buffer);
+  auto buf = basic_memory_buffer<Char>();
+  detail::vformat_to(buf, format_str, args);
+  return to_string(buf);
 }
 
 template <typename... T>
@@ -107,122 +138,131 @@ auto format(wformat_string<T...> fmt, T&&... args) -> std::wstring {
   return vformat(fmt::wstring_view(fmt), fmt::make_wformat_args(args...));
 }
 
+template <typename OutputIt, typename... T>
+auto format_to(OutputIt out, wformat_string<T...> fmt, T&&... args)
+    -> OutputIt {
+  return vformat_to(out, fmt::wstring_view(fmt),
+                    fmt::make_wformat_args(args...));
+}
+
 // Pass char_t as a default template parameter instead of using
 // std::basic_string<char_t<S>> to reduce the symbol size.
-template <typename S, typename... Args, typename Char = char_t<S>,
+template <typename S, typename... T,
+          typename Char = detail::format_string_char_t<S>,
           FMT_ENABLE_IF(!std::is_same<Char, char>::value &&
                         !std::is_same<Char, wchar_t>::value)>
-auto format(const S& format_str, Args&&... args) -> std::basic_string<Char> {
+auto format(const S& format_str, T&&... args) -> std::basic_string<Char> {
   return vformat(detail::to_string_view(format_str),
-                 fmt::make_format_args<buffer_context<Char>>(args...));
+                 fmt::make_format_args<buffered_context<Char>>(args...));
 }
 
-template <typename Locale, typename S, typename Char = char_t<S>,
+template <typename Locale, typename S,
+          typename Char = detail::format_string_char_t<S>,
           FMT_ENABLE_IF(detail::is_locale<Locale>::value&&
                             detail::is_exotic_char<Char>::value)>
-inline auto vformat(
-    const Locale& loc, const S& format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args)
+inline auto vformat(const Locale& loc, const S& format_str,
+                    typename detail::vformat_args<Char>::type args)
     -> std::basic_string<Char> {
   return detail::vformat(loc, detail::to_string_view(format_str), args);
 }
 
-template <typename Locale, typename S, typename... Args,
-          typename Char = char_t<S>,
+template <typename Locale, typename S, typename... T,
+          typename Char = detail::format_string_char_t<S>,
           FMT_ENABLE_IF(detail::is_locale<Locale>::value&&
                             detail::is_exotic_char<Char>::value)>
-inline auto format(const Locale& loc, const S& format_str, Args&&... args)
+inline auto format(const Locale& loc, const S& format_str, T&&... args)
     -> std::basic_string<Char> {
-  return detail::vformat(loc, detail::to_string_view(format_str),
-                         fmt::make_format_args<buffer_context<Char>>(args...));
+  return detail::vformat(
+      loc, detail::to_string_view(format_str),
+      fmt::make_format_args<buffered_context<Char>>(args...));
 }
 
-template <typename OutputIt, typename S, typename Char = char_t<S>,
+template <typename OutputIt, typename S,
+          typename Char = detail::format_string_char_t<S>,
           FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
                             detail::is_exotic_char<Char>::value)>
 auto vformat_to(OutputIt out, const S& format_str,
-                basic_format_args<buffer_context<type_identity_t<Char>>> args)
-    -> OutputIt {
+                typename detail::vformat_args<Char>::type args) -> OutputIt {
   auto&& buf = detail::get_buffer<Char>(out);
   detail::vformat_to(buf, detail::to_string_view(format_str), args);
   return detail::get_iterator(buf, out);
 }
 
-template <typename OutputIt, typename S, typename... Args,
-          typename Char = char_t<S>,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
-                            detail::is_exotic_char<Char>::value)>
-inline auto format_to(OutputIt out, const S& fmt, Args&&... args) -> OutputIt {
+template <typename OutputIt, typename S, typename... T,
+          typename Char = detail::format_string_char_t<S>,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value &&
+                        !std::is_same<Char, char>::value &&
+                        !std::is_same<Char, wchar_t>::value)>
+inline auto format_to(OutputIt out, const S& fmt, T&&... args) -> OutputIt {
   return vformat_to(out, detail::to_string_view(fmt),
-                    fmt::make_format_args<buffer_context<Char>>(args...));
+                    fmt::make_format_args<buffered_context<Char>>(args...));
 }
 
 template <typename Locale, typename S, typename OutputIt, typename... Args,
-          typename Char = char_t<S>,
+          typename Char = detail::format_string_char_t<S>,
           FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
                             detail::is_locale<Locale>::value&&
                                 detail::is_exotic_char<Char>::value)>
-inline auto vformat_to(
-    OutputIt out, const Locale& loc, const S& format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) -> OutputIt {
+inline auto vformat_to(OutputIt out, const Locale& loc, const S& format_str,
+                       typename detail::vformat_args<Char>::type args)
+    -> OutputIt {
   auto&& buf = detail::get_buffer<Char>(out);
   vformat_to(buf, detail::to_string_view(format_str), args,
              detail::locale_ref(loc));
-  return detail::get_iterator(buf);
+  return detail::get_iterator(buf, out);
 }
 
-template <
-    typename OutputIt, typename Locale, typename S, typename... Args,
-    typename Char = char_t<S>,
-    bool enable = detail::is_output_iterator<OutputIt, Char>::value&&
-        detail::is_locale<Locale>::value&& detail::is_exotic_char<Char>::value>
+template <typename OutputIt, typename Locale, typename S, typename... T,
+          typename Char = detail::format_string_char_t<S>,
+          bool enable = detail::is_output_iterator<OutputIt, Char>::value &&
+                        detail::is_locale<Locale>::value &&
+                        detail::is_exotic_char<Char>::value>
 inline auto format_to(OutputIt out, const Locale& loc, const S& format_str,
-                      Args&&... args) ->
+                      T&&... args) ->
     typename std::enable_if<enable, OutputIt>::type {
-  return vformat_to(out, loc, to_string_view(format_str),
-                    fmt::make_format_args<buffer_context<Char>>(args...));
+  return vformat_to(out, loc, detail::to_string_view(format_str),
+                    fmt::make_format_args<buffered_context<Char>>(args...));
 }
 
 template <typename OutputIt, typename Char, typename... Args,
           FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
                             detail::is_exotic_char<Char>::value)>
-inline auto vformat_to_n(
-    OutputIt out, size_t n, basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args)
+inline auto vformat_to_n(OutputIt out, size_t n,
+                         basic_string_view<Char> format_str,
+                         typename detail::vformat_args<Char>::type args)
     -> format_to_n_result<OutputIt> {
-  detail::iterator_buffer<OutputIt, Char, detail::fixed_buffer_traits> buf(out,
-                                                                           n);
+  using traits = detail::fixed_buffer_traits;
+  auto buf = detail::iterator_buffer<OutputIt, Char, traits>(out, n);
   detail::vformat_to(buf, format_str, args);
   return {buf.out(), buf.count()};
 }
 
-template <typename OutputIt, typename S, typename... Args,
-          typename Char = char_t<S>,
+template <typename OutputIt, typename S, typename... T,
+          typename Char = detail::format_string_char_t<S>,
           FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value&&
                             detail::is_exotic_char<Char>::value)>
-inline auto format_to_n(OutputIt out, size_t n, const S& fmt,
-                        const Args&... args) -> format_to_n_result<OutputIt> {
-  return vformat_to_n(out, n, detail::to_string_view(fmt),
-                      fmt::make_format_args<buffer_context<Char>>(args...));
+inline auto format_to_n(OutputIt out, size_t n, const S& fmt, T&&... args)
+    -> format_to_n_result<OutputIt> {
+  return vformat_to_n(out, n, fmt::basic_string_view<Char>(fmt),
+                      fmt::make_format_args<buffered_context<Char>>(args...));
 }
 
-template <typename S, typename... Args, typename Char = char_t<S>,
+template <typename S, typename... T,
+          typename Char = detail::format_string_char_t<S>,
           FMT_ENABLE_IF(detail::is_exotic_char<Char>::value)>
-inline auto formatted_size(const S& fmt, Args&&... args) -> size_t {
-  detail::counting_buffer<Char> buf;
+inline auto formatted_size(const S& fmt, T&&... args) -> size_t {
+  auto buf = detail::counting_buffer<Char>();
   detail::vformat_to(buf, detail::to_string_view(fmt),
-                     fmt::make_format_args<buffer_context<Char>>(args...));
+                     fmt::make_format_args<buffered_context<Char>>(args...));
   return buf.count();
 }
 
 inline void vprint(std::FILE* f, wstring_view fmt, wformat_args args) {
-  wmemory_buffer buffer;
-  detail::vformat_to(buffer, fmt, args);
-  buffer.push_back(L'\0');
-#if !__NVCC__
-  if (std::fputws(buffer.data(), f) == -1)
+  auto buf = wmemory_buffer();
+  detail::vformat_to(buf, fmt, args);
+  buf.push_back(L'\0');
+  if (std::fputws(buf.data(), f) == -1)
     FMT_THROW(system_error(errno, FMT_STRING("cannot write to file")));
-#endif
 }
 
 inline void vprint(wstring_view fmt, wformat_args args) {
@@ -238,13 +278,45 @@ template <typename... T> void print(wformat_string<T...> fmt, T&&... args) {
   return vprint(wstring_view(fmt), fmt::make_wformat_args(args...));
 }
 
-/**
-  Converts *value* to ``std::wstring`` using the default format for type *T*.
- */
+template <typename... T>
+void println(std::FILE* f, wformat_string<T...> fmt, T&&... args) {
+  return print(f, L"{}\n", fmt::format(fmt, std::forward<T>(args)...));
+}
+
+template <typename... T> void println(wformat_string<T...> fmt, T&&... args) {
+  return print(L"{}\n", fmt::format(fmt, std::forward<T>(args)...));
+}
+
+inline auto vformat(const text_style& ts, wstring_view fmt, wformat_args args)
+    -> std::wstring {
+  auto buf = wmemory_buffer();
+  detail::vformat_to(buf, ts, fmt, args);
+  return fmt::to_string(buf);
+}
+
+template <typename... T>
+inline auto format(const text_style& ts, wformat_string<T...> fmt, T&&... args)
+    -> std::wstring {
+  return fmt::vformat(ts, fmt, fmt::make_wformat_args(args...));
+}
+
+template <typename... T>
+FMT_DEPRECATED void print(std::FILE* f, const text_style& ts,
+                          wformat_string<T...> fmt, const T&... args) {
+  vprint(f, ts, fmt, fmt::make_wformat_args(args...));
+}
+
+template <typename... T>
+FMT_DEPRECATED void print(const text_style& ts, wformat_string<T...> fmt,
+                          const T&... args) {
+  return print(stdout, ts, fmt, args...);
+}
+
+/// Converts `value` to `std::wstring` using the default format for type `T`.
 template <typename T> inline auto to_wstring(const T& value) -> std::wstring {
   return format(FMT_STRING(L"{}"), value);
 }
-FMT_MODULE_EXPORT_END
+FMT_END_EXPORT
 FMT_END_NAMESPACE
 
 #endif  // FMT_XCHAR_H_
diff --git a/packages/seacas/libraries/ioss/src/visualization/cgns/Iovs_cgns_DatabaseIO.C b/packages/seacas/libraries/ioss/src/visualization/cgns/Iovs_cgns_DatabaseIO.C
index c42e1d37d45e..05abe5fafece 100644
--- a/packages/seacas/libraries/ioss/src/visualization/cgns/Iovs_cgns_DatabaseIO.C
+++ b/packages/seacas/libraries/ioss/src/visualization/cgns/Iovs_cgns_DatabaseIO.C
@@ -1,4 +1,4 @@
-// Copyright(C) 1999-2021, 2023 National Technology & Engineering Solutions
+// Copyright(C) 1999-2021, 2023, 2024 National Technology & Engineering Solutions
 // of Sandia, LLC (NTESS).  Under the terms of Contract DE-NA0003525 with
 // NTESS, the U.S. Government retains certain rights in this software.
 //
diff --git a/packages/seacas/libraries/suplib_cpp/iqsort.C b/packages/seacas/libraries/suplib_cpp/iqsort.C
index a4350b37cdc0..9df50763e151 100644
--- a/packages/seacas/libraries/suplib_cpp/iqsort.C
+++ b/packages/seacas/libraries/suplib_cpp/iqsort.C
@@ -1,4 +1,4 @@
-// Copyright(C) 1999-2020, 2023 National Technology & Engineering Solutions
+// Copyright(C) 1999-2020, 2023, 2024 National Technology & Engineering Solutions
 // of Sandia, LLC (NTESS).  Under the terms of Contract DE-NA0003525 with
 // NTESS, the U.S. Government retains certain rights in this software.
 //
@@ -155,7 +155,7 @@ namespace {
 
   template <typename T, typename INT> void check(const T v[], INT iv[], size_t N)
   {
-    fmt::print(stderr, "Checking sort of {:n} values\n", N + 1);
+    fmt::print(stderr, "Checking sort of {:L} values\n", N + 1); // 'L' replaces fmt's removed 'n' spec
     size_t i;
     for (i = 1; i < N; i++) {
       SMART_ASSERT(v[iv[i - 1]] <= v[iv[i]]);
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_MatvecFence.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_MatvecFence.cpp
index c1c06d2b3b30..b9da413db133 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_MatvecFence.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_MatvecFence.cpp
@@ -200,9 +200,17 @@ namespace {
         // Did not test the case of Serial node in build with Serial and OpenMP and GPU-aware
         expectedGlobalCount = iter_num;
         if (Tpetra::Details::Behavior::debug()) {
+#if KOKKOS_VERSION >= 40499
+          expectedInstanceCount = 3*iter_num;
+#else
           expectedInstanceCount = 4*iter_num;
+#endif
         } else {
+#if KOKKOS_VERSION >= 40499
+          expectedInstanceCount = 2*iter_num;
+#else
           expectedInstanceCount = 3*iter_num;
+#endif
         }
       }
 #endif