WIP: use SIMD.jl for partials #570
base: release-0.10
Conversation
Codecov Report

Patch coverage:

Additional details and impacted files:

@@            Coverage Diff             @@
##           master     #570      +/-   ##
==========================================
- Coverage   86.71%   85.38%   -1.33%
==========================================
  Files           9        9
  Lines         903      869      -34
==========================================
- Hits          783      742      -41
- Misses        120      127       +7

☔ View full report in Codecov by Sentry.
PR + merged master locally:

* time: 4.9393510818481445
41.416090 seconds (90.34 M allocations: 5.079 GiB, 4.08% gc time, 162.06% compilation time: <1% of which was recompilation)
4.908872 seconds (28.16 M allocations: 1.333 GiB, 2.66% gc time, 0.20% compilation time)
4.811479 seconds (28.15 M allocations: 1.333 GiB, 2.55% gc time)
4.728634 seconds (28.15 M allocations: 1.333 GiB, 2.41% gc time)
90.008 secs

Master:

* time: 4.6424009799957275
36.167679 seconds (81.11 M allocations: 4.641 GiB, 4.30% gc time, 176.86% compilation time: <1% of which was recompilation)
4.709468 seconds (28.18 M allocations: 1.329 GiB, 2.68% gc time, 0.21% compilation time)
4.612684 seconds (28.17 M allocations: 1.329 GiB, 2.28% gc time)
4.640914 seconds (28.17 M allocations: 1.329 GiB, 2.46% gc time)
83.842 secs

I do think we should try to get some form of this in, but I don't see an improvement in the current form.
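For reference, a minimal sketch of the harness pattern behind numbers like these (hypothetical; the actual benchmark script isn't included in this thread). The first `@time` call captures compilation, later calls capture steady-state runtime:

using ForwardDiff

# Hypothetical harness: repeat the workload so run 1 shows compile time
# and subsequent runs show steady-state runtime and allocations.
function time_workload(f, reps = 4)
    for _ in 1:reps
        @time f()
    end
end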
Here's a simpler benchmark:

using ForwardDiff, BenchmarkTools, LinearAlgebra
# Horner evaluation of a matrix polynomial, base case (one coefficient left):
# B = t[1]*I + A*D
function _matevalpoly!(B, C, D, A::AbstractMatrix{T}, t::NTuple{1}) where {T}
M = size(A, 1)
te = T(last(t))
@inbounds for n = 1:M, m = 1:M
B[m, n] = ifelse(m == n, te, zero(te))
end
@inbounds for n = 1:M, k = 1:M, m = 1:M
B[m, n] = muladd(A[m, k], D[k, n], B[m, n])
end
return B
end
# Horner step: C = t[end]*I + A*D, then recurse on the remaining coefficients,
# swapping the scratch buffers C and D.
function _matevalpoly!(B, C, D, A::AbstractMatrix{T}, t::NTuple) where {T}
M = size(A, 1)
te = T(last(t))
@inbounds for n = 1:M, m = 1:M
C[m, n] = ifelse(m == n, te, zero(te))
end
@inbounds for n = 1:M, k = 1:M, m = 1:M
C[m, n] = muladd(A[m, k], D[k, n], C[m, n])
end
_matevalpoly!(B, D, C, A, Base.front(t))
end
# Evaluate the polynomial in `A` with coefficients `t` (ordered lowest to
# highest degree) into `B`, using `C` and `D` as scratch space.
function matevalpoly!(B, C, D, A::AbstractMatrix{T}, t::NTuple) where {T}
t1 = Base.front(t)
te = T(last(t))
tp = T(last(t1))
@inbounds for j in axes(A, 2), i in axes(A, 1)
D[i, j] = muladd(A[i, j], te, ifelse(i == j, tp, zero(tp)))
end
_matevalpoly!(B, C, D, A, Base.front(t1))
end
# Two-coefficient case: B = t[2]*A + t[1]*I directly.
function matevalpoly!(B, C, D, A::AbstractMatrix{T}, t::NTuple{2}) where {T}
t1 = Base.front(t)
te = T(last(t))
tp = T(last(t1))
@inbounds for j in axes(A, 2), i in axes(A, 1)
B[i, j] = muladd(A[i, j], te, ifelse(i == j, tp, zero(tp)))
end
return B
end
# Naive triple-loop matrix multiply: C = A * B
naive_matmul!(C, A, B) =
for n in axes(C, 2), m in axes(C, 1)
Cmn = zero(eltype(C))
for k in axes(A, 2)
Cmn += A[m, k] * B[k, n]
end
C[m, n] = Cmn
end
# Naive triple-loop multiply-accumulate: C += A * B
naive_matmuladd!(C, A, B) =
for n in axes(C, 2), m in axes(C, 1)
Cmn = zero(eltype(C))
for k in axes(A, 2)
Cmn += A[m, k] * B[k, n]
end
C[m, n] += Cmn
end
function expm!(
Z::AbstractMatrix,
A::AbstractMatrix,
matmul! = naive_matmul!,
matmuladd! = naive_matmuladd!
)
# omitted: matrix balancing, i.e., LAPACK.gebal!
# nA = maximum(sum(abs.(A); dims=Val(1))) # marginally more performant than norm(A, 1)
nA = opnorm(A, 1)
N = LinearAlgebra.checksquare(A)
# B and C are temporaries
## For sufficiently small nA, use lower-order Padé approximations
if nA <= 2.1
U = Z
V = similar(A)
A2 = A * A
if nA <= 0.015
matevalpoly!(V, nothing, nothing, A2, (60, 1))
matmul!(U, A, V)
matevalpoly!(V, nothing, nothing, A2, (120, 12))
else
B = similar(A)
if nA <= 0.25
matevalpoly!(V, nothing, U, A2, (15120, 420, 1))
matmul!(U, A, V)
matevalpoly!(V, nothing, B, A2, (30240, 3360, 30))
else
C = similar(A)
if nA <= 0.95
matevalpoly!(V, C, U, A2, (8648640, 277200, 1512, 1))
matmul!(U, A, V)
matevalpoly!(V, B, C, A2, (17297280, 1995840, 25200, 56))
else
matevalpoly!(V, C, U, A2, (8821612800, 302702400, 2162160, 3960, 1))
matmul!(U, A, V)
matevalpoly!(
V,
B,
C,
A2,
(17643225600, 2075673600, 30270240, 110880, 90)
)
end
end
end
@inbounds for m = 1:N*N
u = U[m]
v = V[m]
U[m] = v + u
V[m] = v - u
end
ldiv!(lu!(V), U)
expA = U
# expA = (V - U) \ (V + U)
else
s = log2(nA / 5.4) # power of 2 later reversed by squaring
if s > 0
si = ceil(Int, s)
A = A / exp2(si)
end
A2 = A * A
A4 = A2 * A2
A6 = A2 * A4
U = Z
B = zero(A)
@inbounds for m = 1:N
B[m, m] = 32382376266240000
end
@inbounds for m = 1:N*N
a6 = A6[m]
a4 = A4[m]
a2 = A2[m]
B[m] = muladd(
33522128640,
a6,
muladd(10559470521600, a4, muladd(1187353796428800, a2, B[m]))
)
U[m] = muladd(16380, a4, muladd(40840800, a2, a6))
end
matmuladd!(B, A6, U)
matmul!(U, A, B)
V = s > 0 ? fill!(A, 0) : zero(A)
@inbounds for m = 1:N
V[m, m] = 64764752532480000
end
@inbounds for m = 1:N*N
a6 = A6[m]
a4 = A4[m]
a2 = A2[m]
B[m] = muladd(182, a6, muladd(960960, a4, 1323241920 * a2))
V[m] = muladd(
670442572800,
a6,
muladd(129060195264000, a4, muladd(7771770303897600, a2, V[m]))
)
end
matmuladd!(V, A6, B)
@inbounds for m = 1:N*N
u = U[m]
v = V[m]
U[m] = v + u
V[m] = v - u
end
ldiv!(lu!(V), U)
expA = U
# expA = (V - U) \ (V + U)
if s > 0 # squaring to reverse dividing by power of 2
for t = 1:si
matmul!(V, expA, expA)
expA, V = V, expA
end
if Z !== expA
copyto!(Z, expA)
expA = Z
end
end
end
expA
end
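As a quick sanity check, the result can be compared against `exp` from LinearAlgebra (a minimal sketch, assuming the definitions above):

using Test

let A = randn(4, 4), Z = similar(A)
    expm!(Z, A)
    @test Z ≈ exp(A)  # scaling-and-squaring Padé result vs. Base's matrix exponential
end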
expm_bad!(Z, A) = expm!(Z, A, mul!, (C, A, B) -> mul!(C, A, B, 1.0, 1.0))
const libMatrixDir = "/home/chriselrod/Documents/progwork/cxx/MatrixExp"
const libMatrixExp = joinpath(libMatrixDir, "buildgcc/libMatrixExp.so")
const libMatrixExpClang = joinpath(libMatrixDir, "buildclang/libMatrixExp.so")
for (lib, cc) in ((:libMatrixExp, :gcc), (:libMatrixExpClang, :clang))
j = Symbol(cc, :expm!)
@eval $j(B::Matrix{Float64}, A::Matrix{Float64}) = @ccall $lib.expmf64(
B::Ptr{Float64},
A::Ptr{Float64},
size(A, 1)::Clong
)::Nothing
for n = 1:8
sym = Symbol(:expmf64d, n)
@eval $j(
B::Matrix{ForwardDiff.Dual{T,Float64,$n}},
A::Matrix{ForwardDiff.Dual{T,Float64,$n}}
) where {T} = @ccall $lib.$sym(
B::Ptr{Float64},
A::Ptr{Float64},
size(A, 1)::Clong
)::Nothing
for i = 1:2
sym = Symbol(:expmf64d, n, :d, i)
@eval $j(
B::Matrix{ForwardDiff.Dual{T1,ForwardDiff.Dual{T0,Float64,$n},$i}},
A::Matrix{ForwardDiff.Dual{T1,ForwardDiff.Dual{T0,Float64,$n},$i}}
) where {T0,T1} = @ccall $lib.$sym(
B::Ptr{Float64},
A::Ptr{Float64},
size(A, 1)::Clong
)::Nothing
end
end
end
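Concretely, the first `@eval` in that loop generates methods equivalent to the following (illustrative expansion for `(lib, cc) = (:libMatrixExp, :gcc)`):

# Calls the C++ shared library's Float64 expm kernel on the raw matrix data.
gccexpm!(B::Matrix{Float64}, A::Matrix{Float64}) = @ccall libMatrixExp.expmf64(
    B::Ptr{Float64},
    A::Ptr{Float64},
    size(A, 1)::Clong
)::Nothing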
# macros are too awkward to work with, so we use a function
# mean times are much better for benchmarking than minimum
# whenever you have a function that allocates
function bmean(f)
b = @benchmark $f()
m = BenchmarkTools.mean(b)
a = BenchmarkTools.allocs(m)
println(
" ",
(BenchmarkTools).prettytime((BenchmarkTools).time(m)),
" (",
a,
" allocation",
(a == 1 ? "" : "s"),
": ",
(BenchmarkTools).prettymemory((BenchmarkTools).memory(m)),
")"
)
end
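For instance, any zero-argument callable works (hypothetical usage):

# Mean-time report for a small allocating workload.
bmean(() -> inv(rand(32, 32)))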
#=
struct Closure{F,A}
f::F
a::A
end
Closure(f, a, b...) = Closure(f, (a, b...))
(c::Closure)() = c.f(c.a...)
=#
struct ForEach{A,B,F}
f::F
a::A
b::B
end
ForEach(f, b) = ForEach(f, nothing, b)
(f::ForEach)() = foreach(Base.Fix1(f.f, f.a), f.b)
(f::ForEach{Nothing})() = foreach(f.f, f.b)
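`ForEach` packages a function, a fixed first argument, and a collection into a zero-argument callable, so it composes with `bmean` and `@benchmark` without global-scope closures. A minimal illustration:

fe = ForEach(println, "x = ", 1:3)
fe()  # prints "x = 1", "x = 2", "x = 3"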
const BENCH_OPNORMS = (66.0, 33.0, 22.0, 11.0, 6.0, 3.0, 2.0, 0.5, 0.03, 0.001)
"""
Generates one random matrix per opnorm.
All generated matrices are scalar multiples of one another.
This is meant to exercise all code paths in the `expm` function.
"""
function randmatrices(n)
A = randn(n, n)
op = opnorm(A, 1)
map(BENCH_OPNORMS) do x
(x / op) .* A
end
end
d(x, n) = ForwardDiff.Dual(x, ntuple(_ -> randn(), n))
function dualify(A, n, j)
if n > 0
A = d.(A, n)
if (j > 0)
A = d.(A, j)
end
end
A
end
duals(l, n, j = 0) = dualify(rand(l, l), n, j)
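So `duals(l, n, j)` yields an `l×l` matrix of nested duals with `n` inner and `j` outer partials; for example (the type shown is illustrative):

A = duals(4, 7, 2)
# eltype(A) is a nested dual, roughly:
# ForwardDiff.Dual{Nothing, ForwardDiff.Dual{Nothing, Float64, 7}, 2}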
As = map(x -> dualify(x, 7, 2), randmatrices(4));
B = similar(first(As));
@benchmark ForEach(expm!, $B, $As)()
@benchmark ForEach(expm_bad!, $B, $As)()

Also, using SimpleChains:

using SimpleChains
function scmatmul!(C,A,B)
z = SimpleChains.zero_offsets
GC.@preserve C A B begin
SimpleChains.matmul!(z(C),z(A),z(B),SimpleChains.False())
end; nothing
end
expm_good!(Z, A) = expm!(Z, A, scmatmul!)

Here is what I get on ForwardDiff master:

julia> @benchmark ForEach(expm!, $B, $As)()
BenchmarkTools.Trial: 5453 samples with 1 evaluation.
Range (min … max): 159.280 μs … 3.901 ms ┊ GC (min … max): 0.00% … 89.32%
Time (median): 167.663 μs ┊ GC (median): 0.00%
Time (mean ± σ): 182.380 μs ± 127.957 μs ┊ GC (mean ± σ): 6.66% ± 8.69%
█▅▁ ▁
███▇▄▃▅▃▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▅▅ █
159 μs Histogram: log(frequency) by time 1.08 ms <
Memory estimate: 545.41 KiB, allocs estimate: 119.
julia> @benchmark ForEach(expm_bad!, $B, $As)()
BenchmarkTools.Trial: 4532 samples with 1 evaluation.
Range (min … max): 185.609 μs … 1.409 ms ┊ GC (min … max): 0.00% … 77.38%
Time (median): 197.894 μs ┊ GC (median): 0.00%
Time (mean ± σ): 219.516 μs ± 131.434 μs ┊ GC (mean ± σ): 9.11% ± 11.97%
█▇▂ ▁
███▇▅▃▃▁▁▄▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▇██ █
186 μs Histogram: log(frequency) by time 998 μs <
Memory estimate: 1.17 MiB, allocs estimate: 224.

Here is what I get on this branch:

julia> @benchmark ForEach(expm!, $B, $As)()
BenchmarkTools.Trial: 4756 samples with 1 evaluation.
Range (min … max): 175.355 μs … 3.903 ms ┊ GC (min … max): 0.00% … 89.63%
Time (median): 188.458 μs ┊ GC (median): 0.00%
Time (mean ± σ): 209.249 μs ± 145.212 μs ┊ GC (mean ± σ): 6.58% ± 8.67%
█▆▁ ▂▄ ▁
███▇▇██▆▃▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▄▁▄▅ █
175 μs Histogram: log(frequency) by time 1.18 ms <
Memory estimate: 545.41 KiB, allocs estimate: 119.
julia> @benchmark ForEach(expm_bad!, $B, $As)()
BenchmarkTools.Trial: 4200 samples with 1 evaluation.
Range (min … max): 202.569 μs … 1.878 ms ┊ GC (min … max): 0.00% … 84.78%
Time (median): 212.298 μs ┊ GC (median): 0.00%
Time (mean ± σ): 236.997 μs ± 130.260 μs ┊ GC (mean ± σ): 8.17% ± 11.64%
█▇▂ ▁
███▇▅▅▅▁▃▁▄▄▃▅▅▅▅▇▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▄▅▇▇▇▇ █
203 μs Histogram: log(frequency) by time 978 μs <
Memory estimate: 1.17 MiB, allocs estimate: 224.

And for good measure, here is a simple C++ implementation:

template <typename T>
constexpr void expm(MutSquarePtrMatrix<T> V, SquarePtrMatrix<T> A) {
invariant(ptrdiff_t(V.numRow()), ptrdiff_t(A.numRow()));
unsigned n = unsigned(A.numRow());
auto nA = opnorm1(A);
SquareMatrix<T, L> A2_{A * A}, U_{SquareDims{n}};
MutSquarePtrMatrix<T> A2{A2_}, U{U_};
unsigned int s = 0;
if (nA <= 2.1) {
containers::TinyVector<double, 5> p0, p1;
if (nA > 0.95) {
p0 = {1.0, 3960.0, 2162160.0, 302702400.0, 8821612800.0};
p1 = {90.0, 110880.0, 3.027024e7, 2.0756736e9, 1.76432256e10};
} else if (nA > 0.25) {
p0 = {1.0, 1512.0, 277200.0, 8.64864e6};
p1 = {56.0, 25200.0, 1.99584e6, 1.729728e7};
} else if (nA > 0.015) {
p0 = {1.0, 420.0, 15120.0};
p1 = {30.0, 3360.0, 30240.0};
} else {
p0 = {1.0, 60.0};
p1 = {12.0, 120.0};
}
evalpoly(V, A2, p0);
U << A * V;
evalpoly(V, A2, p1);
} else {
s = std::max(int(std::ceil(std::log2(nA / 5.4))), 0);
double t = 1.0;
if (s > 0) {
t = 1.0 / std::exp2(s);
A2 *= (t * t);
if (s & 1) std::swap(U, V);
}
SquareMatrix<T, L> A4{A2 * A2}, A6{A2 * A4};
V << A6 * (A6 + 16380 * A4 + 40840800 * A2) +
(33522128640 * A6 + 10559470521600 * A4 + 1187353796428800 * A2) +
32382376266240000 * I;
U << A * V;
if (s > 0) U *= t;
V << A6 * (182 * A6 + 960960 * A4 + 1323241920 * A2) +
(670442572800 * A6 + 129060195264000 * A4 + 7771770303897600 * A2) +
64764752532480000 * I;
}
for (auto v = V.begin(), u = U.begin(), e = V.end(); v != e; ++v, ++u) {
auto &&d = *v - *u;
*v += *u;
*u = d;
}
LU::ldiv(U, MutPtrMatrix<T>(V));
for (; s--;) {
U << V * V;
std::swap(U, V);
}
}

Yielding:

julia> @benchmark ForEach(gccexpm!, $B, $As)()
BenchmarkTools.Trial: 8749 samples with 1 evaluation.
Range (min … max): 110.272 μs … 370.745 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 112.821 μs ┊ GC (median): 0.00%
Time (mean ± σ): 113.465 μs ± 5.759 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▄█▂ ▃▄ ▁ ▁
▇▁▁▁▁▄▁▅▅▄███▅▅█████▆▆▇▅▄▃▄▃▅▄▄▁▃▅▆▅▅▃▃▅▅▄▃▅▅▅▃▆▁▃▄▄▄▄▄▄▄▃▃▄█ █
110 μs Histogram: log(frequency) by time 124 μs <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark ForEach(clangexpm!, $B, $As)()
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 73.307 μs … 231.474 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 73.503 μs ┊ GC (median): 0.00%
Time (mean ± σ): 73.789 μs ± 3.143 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▄██▇▆▆▆▅▃▁ ▁ ▂▂▃▂▂▁ ▂
██████████▆▆██▆▁███████▇▇▇█▇█▇▆▇▆▆▅▄▁▄▄▃▅▄▄▄▃▁▃▁▄▁▄▁▄▅▁▃▃▄▃▃ █
73.3 μs Histogram: log(frequency) by time 77.3 μs <
Memory estimate: 0 bytes, allocs estimate: 0.

Want to switch the base branch to …?

The primary risk of regressions to be mindful of is that the inliner assigns …

Anyone else want to review this?

Is the compilation time regression fixed (that was the cause of the previous revert, IIRC)?
One benchmark, with the latest ForwardDiff:

58.703225 seconds (120.99 M allocations: 7.162 GiB, 5.61% gc time, 82.85% compilation time: <1% of which was recompilation)
Multithreaded
5.856279 seconds (28.28 M allocations: 1.502 GiB, 5.04% gc time, 34.00% compilation time)
3.966313 seconds (24.36 M allocations: 1.269 GiB, 3.71% gc time)
3.990597 seconds (24.36 M allocations: 1.269 GiB, 3.26% gc time)
3.912937 seconds (24.36 M allocations: 1.269 GiB, 2.99% gc time)
3.926629 seconds (24.36 M allocations: 1.269 GiB, 3.30% gc time)
Singlethreaded
10.105259 seconds (24.41 M allocations: 1.271 GiB, 2.44% gc time)
10.108382 seconds (24.41 M allocations: 1.271 GiB, 2.42% gc time)
10.092458 seconds (24.41 M allocations: 1.271 GiB, 2.38% gc time)
10.090259 seconds (24.41 M allocations: 1.271 GiB, 2.53% gc time)
10.088804 seconds (24.41 M allocations: 1.271 GiB, 2.44% gc time)
226.945 secs

vs this PR:

56.318331 seconds (132.22 M allocations: 7.687 GiB, 5.07% gc time, 85.20% compilation time: <1% of which was recompilation)
Multithreaded
5.237769 seconds (28.39 M allocations: 1.513 GiB, 4.91% gc time, 36.93% compilation time)
3.390317 seconds (24.46 M allocations: 1.279 GiB, 3.85% gc time)
3.373176 seconds (24.46 M allocations: 1.279 GiB, 3.36% gc time)
3.369030 seconds (24.46 M allocations: 1.279 GiB, 3.30% gc time)
3.361547 seconds (24.46 M allocations: 1.279 GiB, 3.24% gc time)
Singlethreaded
8.358523 seconds (24.46 M allocations: 1.278 GiB, 2.30% gc time)
8.345199 seconds (24.46 M allocations: 1.278 GiB, 2.15% gc time)
8.333443 seconds (24.46 M allocations: 1.278 GiB, 2.11% gc time)
8.328236 seconds (24.46 M allocations: 1.278 GiB, 2.05% gc time)
8.337621 seconds (24.46 M allocations: 1.278 GiB, 2.06% gc time)
215.426 secs

The first time reported above includes compilation time, while the final "secs" figure is the total script time reported from the command line. Compile time didn't seem to change much, but we got a decent improvement in runtime. Unfortunately, I did try another problem where I saw a hefty compile-time regression:

294.641244 seconds (1.24 G allocations: 310.848 GiB, 10.16% gc time, 7.09% compilation time)
1196.506264 seconds (361.53 M allocations: 54.645 GiB, 0.57% gc time, 67.88% compilation time)
Multithreaded
90.339731 seconds (1.28 G allocations: 327.525 GiB, 15.69% gc time, 2.19% compilation time)
98.263920 seconds (59.49 M allocations: 40.988 GiB, 1.20% gc time, 0.80% compilation time)
88.254495 seconds (1.27 G allocations: 327.282 GiB, 15.96% gc time)
96.993289 seconds (58.71 M allocations: 40.937 GiB, 1.33% gc time)
Singlethreaded
262.654300 seconds (1.20 G allocations: 308.277 GiB, 5.76% gc time)
383.569549 seconds (58.68 M allocations: 40.927 GiB, 0.36% gc time)
1475.777 secs

vs this PR:

335.140249 seconds (1.65 G allocations: 427.614 GiB, 7.81% gc time, 6.51% compilation time)
1759.027660 seconds (848.52 M allocations: 74.163 GiB, 0.73% gc time, 88.33% compilation time)
Multithreaded
77.094182 seconds (1.20 G allocations: 308.489 GiB, 17.78% gc time, 2.49% compilation time)
61.022739 seconds (70.76 M allocations: 49.174 GiB, 2.55% gc time, 1.29% compilation time)
72.532341 seconds (1.20 G allocations: 308.245 GiB, 18.73% gc time)
59.765939 seconds (69.98 M allocations: 49.122 GiB, 1.83% gc time)
Singlethreaded
322.542648 seconds (1.60 G allocations: 424.651 GiB, 5.87% gc time)
203.501279 seconds (61.39 M allocations: 43.153 GiB, 0.66% gc time)
2959.059 secs

The first two times here include compile times. Note that the benchmark above alternates between using … I'm still a fan of this PR, but it'd be great if someone from the compiler team could look into improving compile times of …
This needs thorough testing by downstream users to ensure regressions haven't crept in due to the implementation in SIMD.jl.
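As one example of the kind of check that could guard against such regressions, here is a hedged sketch comparing ForwardDiff gradients against central finite differences (the test function and tolerance are arbitrary choices, not from this thread):

using ForwardDiff, Test

f(x) = sum(exp, x) + prod(x)
x = randn(5)
g = ForwardDiff.gradient(f, x)
h = 1e-6
for i in eachindex(x)
    e = zeros(length(x)); e[i] = h
    # central finite difference for the i-th partial derivative
    @test isapprox(g[i], (f(x .+ e) - f(x .- e)) / (2h); rtol = 1e-5)
end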