WIP: use SIMD.jl for partials #570
base: release-0.10
Conversation
Codecov Report

Patch coverage:

Additional details and impacted files:

@@            Coverage Diff             @@
##           master     #570      +/-   ##
==========================================
- Coverage   86.71%   85.38%   -1.33%
==========================================
  Files           9        9
  Lines         903      869      -34
==========================================
- Hits          783      742      -41
- Misses        120      127       +7

☔ View full report in Codecov by Sentry.
PR + merged master locally:

* time: 4.9393510818481445
41.416090 seconds (90.34 M allocations: 5.079 GiB, 4.08% gc time, 162.06% compilation time: <1% of which was recompilation)
4.908872 seconds (28.16 M allocations: 1.333 GiB, 2.66% gc time, 0.20% compilation time)
4.811479 seconds (28.15 M allocations: 1.333 GiB, 2.55% gc time)
4.728634 seconds (28.15 M allocations: 1.333 GiB, 2.41% gc time)
90.008 secs

Master:

* time: 4.6424009799957275
36.167679 seconds (81.11 M allocations: 4.641 GiB, 4.30% gc time, 176.86% compilation time: <1% of which was recompilation)
4.709468 seconds (28.18 M allocations: 1.329 GiB, 2.68% gc time, 0.21% compilation time)
4.612684 seconds (28.17 M allocations: 1.329 GiB, 2.28% gc time)
4.640914 seconds (28.17 M allocations: 1.329 GiB, 2.46% gc time)
83.842 secs

I do think we should try to get some form of this in, but I don't see an improvement in the current form.
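For reference, a minimal sketch of the harness pattern behind numbers like these (hypothetical; the actual benchmark script isn't included in this thread). The first `@time` call captures compilation, later calls capture steady-state runtime:

using ForwardDiff

# Hypothetical harness: repeat the workload so run 1 shows compile time
# and subsequent runs show steady-state runtime and allocations.
function time_workload(f, reps = 4)
    for _ in 1:reps
        @time f()
    end
end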
Here's a simpler benchmark:

using ForwardDiff, BenchmarkTools, LinearAlgebra
# Horner evaluation of a matrix polynomial, base case (one coefficient left):
# B = t[1]*I + A*D
function _matevalpoly!(B, C, D, A::AbstractMatrix{T}, t::NTuple{1}) where {T}
M = size(A, 1)
te = T(last(t))
@inbounds for n = 1:M, m = 1:M
B[m, n] = ifelse(m == n, te, zero(te))
end
@inbounds for n = 1:M, k = 1:M, m = 1:M
B[m, n] = muladd(A[m, k], D[k, n], B[m, n])
end
return B
end
# Horner step: C = t[end]*I + A*D, then recurse on the remaining coefficients,
# swapping the scratch buffers C and D.
function _matevalpoly!(B, C, D, A::AbstractMatrix{T}, t::NTuple) where {T}
M = size(A, 1)
te = T(last(t))
@inbounds for n = 1:M, m = 1:M
C[m, n] = ifelse(m == n, te, zero(te))
end
@inbounds for n = 1:M, k = 1:M, m = 1:M
C[m, n] = muladd(A[m, k], D[k, n], C[m, n])
end
_matevalpoly!(B, D, C, A, Base.front(t))
end
# Evaluate the polynomial in `A` with coefficients `t` (ordered lowest to
# highest degree) into `B`, using `C` and `D` as scratch space.
function matevalpoly!(B, C, D, A::AbstractMatrix{T}, t::NTuple) where {T}
t1 = Base.front(t)
te = T(last(t))
tp = T(last(t1))
@inbounds for j in axes(A, 2), i in axes(A, 1)
D[i, j] = muladd(A[i, j], te, ifelse(i == j, tp, zero(tp)))
end
_matevalpoly!(B, C, D, A, Base.front(t1))
end
# Two-coefficient case: B = t[2]*A + t[1]*I directly.
function matevalpoly!(B, C, D, A::AbstractMatrix{T}, t::NTuple{2}) where {T}
t1 = Base.front(t)
te = T(last(t))
tp = T(last(t1))
@inbounds for j in axes(A, 2), i in axes(A, 1)
B[i, j] = muladd(A[i, j], te, ifelse(i == j, tp, zero(tp)))
end
return B
end
# Naive triple-loop matrix multiply: C = A * B
naive_matmul!(C, A, B) =
for n in axes(C, 2), m in axes(C, 1)
Cmn = zero(eltype(C))
for k in axes(A, 2)
Cmn += A[m, k] * B[k, n]
end
C[m, n] = Cmn
end
# Naive triple-loop multiply-accumulate: C += A * B
naive_matmuladd!(C, A, B) =
for n in axes(C, 2), m in axes(C, 1)
Cmn = zero(eltype(C))
for k in axes(A, 2)
Cmn += A[m, k] * B[k, n]
end
C[m, n] += Cmn
end
function expm!(
Z::AbstractMatrix,
A::AbstractMatrix,
matmul! = naive_matmul!,
matmuladd! = naive_matmuladd!
)
# omitted: matrix balancing, i.e., LAPACK.gebal!
# nA = maximum(sum(abs.(A); dims=Val(1))) # marginally more performant than norm(A, 1)
nA = opnorm(A, 1)
N = LinearAlgebra.checksquare(A)
# B and C are temporaries
## For sufficiently small nA, use lower-order Padé approximations
if nA <= 2.1
U = Z
V = similar(A)
A2 = A * A
if nA <= 0.015
matevalpoly!(V, nothing, nothing, A2, (60, 1))
matmul!(U, A, V)
matevalpoly!(V, nothing, nothing, A2, (120, 12))
else
B = similar(A)
if nA <= 0.25
matevalpoly!(V, nothing, U, A2, (15120, 420, 1))
matmul!(U, A, V)
matevalpoly!(V, nothing, B, A2, (30240, 3360, 30))
else
C = similar(A)
if nA <= 0.95
matevalpoly!(V, C, U, A2, (8648640, 277200, 1512, 1))
matmul!(U, A, V)
matevalpoly!(V, B, C, A2, (17297280, 1995840, 25200, 56))
else
matevalpoly!(V, C, U, A2, (8821612800, 302702400, 2162160, 3960, 1))
matmul!(U, A, V)
matevalpoly!(
V,
B,
C,
A2,
(17643225600, 2075673600, 30270240, 110880, 90)
)
end
end
end
@inbounds for m = 1:N*N
u = U[m]
v = V[m]
U[m] = v + u
V[m] = v - u
end
ldiv!(lu!(V), U)
expA = U
# expA = (V - U) \ (V + U)
else
s = log2(nA / 5.4) # power of 2 later reversed by squaring
if s > 0
si = ceil(Int, s)
A = A / exp2(si)
end
A2 = A * A
A4 = A2 * A2
A6 = A2 * A4
U = Z
B = zero(A)
@inbounds for m = 1:N
B[m, m] = 32382376266240000
end
@inbounds for m = 1:N*N
a6 = A6[m]
a4 = A4[m]
a2 = A2[m]
B[m] = muladd(
33522128640,
a6,
muladd(10559470521600, a4, muladd(1187353796428800, a2, B[m]))
)
U[m] = muladd(16380, a4, muladd(40840800, a2, a6))
end
matmuladd!(B, A6, U)
matmul!(U, A, B)
V = s > 0 ? fill!(A, 0) : zero(A)
@inbounds for m = 1:N
V[m, m] = 64764752532480000
end
@inbounds for m = 1:N*N
a6 = A6[m]
a4 = A4[m]
a2 = A2[m]
B[m] = muladd(182, a6, muladd(960960, a4, 1323241920 * a2))
V[m] = muladd(
670442572800,
a6,
muladd(129060195264000, a4, muladd(7771770303897600, a2, V[m]))
)
end
matmuladd!(V, A6, B)
@inbounds for m = 1:N*N
u = U[m]
v = V[m]
U[m] = v + u
V[m] = v - u
end
ldiv!(lu!(V), U)
expA = U
# expA = (V - U) \ (V + U)
if s > 0 # squaring to reverse dividing by power of 2
for t = 1:si
matmul!(V, expA, expA)
expA, V = V, expA
end
if Z !== expA
copyto!(Z, expA)
expA = Z
end
end
end
expA
end
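As a quick sanity check, the result can be compared against `exp` from LinearAlgebra (a minimal sketch, assuming the definitions above):

using Test

let A = randn(4, 4), Z = similar(A)
    expm!(Z, A)
    @test Z ≈ exp(A)  # scaling-and-squaring Padé result vs. Base's matrix exponential
end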
expm_bad!(Z, A) = expm!(Z, A, mul!, (C, A, B) -> mul!(C, A, B, 1.0, 1.0))
const libMatrixDir = "/home/chriselrod/Documents/progwork/cxx/MatrixExp"
const libMatrixExp = joinpath(libMatrixDir, "buildgcc/libMatrixExp.so")
const libMatrixExpClang = joinpath(libMatrixDir, "buildclang/libMatrixExp.so")
for (lib, cc) in ((:libMatrixExp, :gcc), (:libMatrixExpClang, :clang))
j = Symbol(cc, :expm!)
@eval $j(B::Matrix{Float64}, A::Matrix{Float64}) = @ccall $lib.expmf64(
B::Ptr{Float64},
A::Ptr{Float64},
size(A, 1)::Clong
)::Nothing
for n = 1:8
sym = Symbol(:expmf64d, n)
@eval $j(
B::Matrix{ForwardDiff.Dual{T,Float64,$n}},
A::Matrix{ForwardDiff.Dual{T,Float64,$n}}
) where {T} = @ccall $lib.$sym(
B::Ptr{Float64},
A::Ptr{Float64},
size(A, 1)::Clong
)::Nothing
for i = 1:2
sym = Symbol(:expmf64d, n, :d, i)
@eval $j(
B::Matrix{ForwardDiff.Dual{T1,ForwardDiff.Dual{T0,Float64,$n},$i}},
A::Matrix{ForwardDiff.Dual{T1,ForwardDiff.Dual{T0,Float64,$n},$i}}
) where {T0,T1} = @ccall $lib.$sym(
B::Ptr{Float64},
A::Ptr{Float64},
size(A, 1)::Clong
)::Nothing
end
end
end
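Concretely, the first `@eval` in that loop generates methods equivalent to the following (illustrative expansion for `(lib, cc) = (:libMatrixExp, :gcc)`):

# Calls the C++ shared library's Float64 expm kernel on the raw matrix data.
gccexpm!(B::Matrix{Float64}, A::Matrix{Float64}) = @ccall libMatrixExp.expmf64(
    B::Ptr{Float64},
    A::Ptr{Float64},
    size(A, 1)::Clong
)::Nothing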
# macros are too awkward to work with, so we use a function
# mean times are much better for benchmarking than minimum
# whenever you have a function that allocates
function bmean(f)
b = @benchmark $f()
m = BenchmarkTools.mean(b)
a = BenchmarkTools.allocs(m)
println(
" ",
(BenchmarkTools).prettytime((BenchmarkTools).time(m)),
" (",
a,
" allocation",
(a == 1 ? "" : "s"),
": ",
(BenchmarkTools).prettymemory((BenchmarkTools).memory(m)),
")"
)
end
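For instance, any zero-argument callable works (hypothetical usage):

# Mean-time report for a small allocating workload.
bmean(() -> inv(rand(32, 32)))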
#=
struct Closure{F,A}
f::F
a::A
end
Closure(f, a, b...) = Closure(f, (a, b...))
(c::Closure)() = c.f(c.a...)
=#
struct ForEach{A,B,F}
f::F
a::A
b::B
end
ForEach(f, b) = ForEach(f, nothing, b)
(f::ForEach)() = foreach(Base.Fix1(f.f, f.a), f.b)
(f::ForEach{Nothing})() = foreach(f.f, f.b)
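`ForEach` packages a function, a fixed first argument, and a collection into a zero-argument callable, so it composes with `bmean` and `@benchmark` without global-scope closures. A minimal illustration:

fe = ForEach(println, "x = ", 1:3)
fe()  # prints "x = 1", "x = 2", "x = 3"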
const BENCH_OPNORMS = (66.0, 33.0, 22.0, 11.0, 6.0, 3.0, 2.0, 0.5, 0.03, 0.001)
"""
Generates one random matrix per opnorm.
All generated matrices are scalar multiples of one another.
This is meant to exercise all code paths in the `expm` function.
"""
function randmatrices(n)
A = randn(n, n)
op = opnorm(A, 1)
map(BENCH_OPNORMS) do x
(x / op) .* A
end
end
d(x, n) = ForwardDiff.Dual(x, ntuple(_ -> randn(), n))
function dualify(A, n, j)
if n > 0
A = d.(A, n)
if (j > 0)
A = d.(A, j)
end
end
A
end
duals(l, n, j = 0) = dualify(rand(l, l), n, j)
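So `duals(l, n, j)` yields an `l×l` matrix of nested duals with `n` inner and `j` outer partials; for example (the type shown is illustrative):

A = duals(4, 7, 2)
# eltype(A) is a nested dual, roughly:
# ForwardDiff.Dual{Nothing, ForwardDiff.Dual{Nothing, Float64, 7}, 2}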
As = map(x -> dualify(x, 7, 2), randmatrices(4));
B = similar(first(As));
@benchmark ForEach(expm!, $B, $As)()
@benchmark ForEach(expm_bad!, $B, $As)()

Also, using SimpleChains:

using SimpleChains
function scmatmul!(C,A,B)
z = SimpleChains.zero_offsets
GC.@preserve C A B begin
SimpleChains.matmul!(z(C),z(A),z(B),SimpleChains.False())
end; nothing
end
expm_good!(Z, A) = expm!(Z, A, scmatmul!)

Here is what I get on ForwardDiff master:

julia> @benchmark ForEach(expm!, $B, $As)()
BenchmarkTools.Trial: 5453 samples with 1 evaluation.
Range (min … max): 159.280 μs … 3.901 ms ┊ GC (min … max): 0.00% … 89.32%
Time (median): 167.663 μs ┊ GC (median): 0.00%
Time (mean ± σ): 182.380 μs ± 127.957 μs ┊ GC (mean ± σ): 6.66% ± 8.69%
█▅▁ ▁
███▇▄▃▅▃▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▅▅ █
159 μs Histogram: log(frequency) by time 1.08 ms <
Memory estimate: 545.41 KiB, allocs estimate: 119.
julia> @benchmark ForEach(expm_bad!, $B, $As)()
BenchmarkTools.Trial: 4532 samples with 1 evaluation.
Range (min … max): 185.609 μs … 1.409 ms ┊ GC (min … max): 0.00% … 77.38%
Time (median): 197.894 μs ┊ GC (median): 0.00%
Time (mean ± σ): 219.516 μs ± 131.434 μs ┊ GC (mean ± σ): 9.11% ± 11.97%
█▇▂ ▁
███▇▅▃▃▁▁▄▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▇██ █
186 μs Histogram: log(frequency) by time 998 μs <
Memory estimate: 1.17 MiB, allocs estimate: 224.

Here is what I get on this branch:

julia> @benchmark ForEach(expm!, $B, $As)()
BenchmarkTools.Trial: 4756 samples with 1 evaluation.
Range (min … max): 175.355 μs … 3.903 ms ┊ GC (min … max): 0.00% … 89.63%
Time (median): 188.458 μs ┊ GC (median): 0.00%
Time (mean ± σ): 209.249 μs ± 145.212 μs ┊ GC (mean ± σ): 6.58% ± 8.67%
█▆▁ ▂▄ ▁
███▇▇██▆▃▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▄▁▄▅ █
175 μs Histogram: log(frequency) by time 1.18 ms <
Memory estimate: 545.41 KiB, allocs estimate: 119.
julia> @benchmark ForEach(expm_bad!, $B, $As)()
BenchmarkTools.Trial: 4200 samples with 1 evaluation.
Range (min … max): 202.569 μs … 1.878 ms ┊ GC (min … max): 0.00% … 84.78%
Time (median): 212.298 μs ┊ GC (median): 0.00%
Time (mean ± σ): 236.997 μs ± 130.260 μs ┊ GC (mean ± σ): 8.17% ± 11.64%
█▇▂ ▁
███▇▅▅▅▁▃▁▄▄▃▅▅▅▅▇▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▄▅▇▇▇▇ █
203 μs Histogram: log(frequency) by time 978 μs <
Memory estimate: 1.17 MiB, allocs estimate: 224.

And for good measure, here is a simple C++ implementation:

template <typename T>
constexpr void expm(MutSquarePtrMatrix<T> V, SquarePtrMatrix<T> A) {
invariant(ptrdiff_t(V.numRow()), ptrdiff_t(A.numRow()));
unsigned n = unsigned(A.numRow());
auto nA = opnorm1(A);
SquareMatrix<T, L> A2_{A * A}, U_{SquareDims{n}};
MutSquarePtrMatrix<T> A2{A2_}, U{U_};
unsigned int s = 0;
if (nA <= 2.1) {
containers::TinyVector<double, 5> p0, p1;
if (nA > 0.95) {
p0 = {1.0, 3960.0, 2162160.0, 302702400.0, 8821612800.0};
p1 = {90.0, 110880.0, 3.027024e7, 2.0756736e9, 1.76432256e10};
} else if (nA > 0.25) {
p0 = {1.0, 1512.0, 277200.0, 8.64864e6};
p1 = {56.0, 25200.0, 1.99584e6, 1.729728e7};
} else if (nA > 0.015) {
p0 = {1.0, 420.0, 15120.0};
p1 = {30.0, 3360.0, 30240.0};
} else {
p0 = {1.0, 60.0};
p1 = {12.0, 120.0};
}
evalpoly(V, A2, p0);
U << A * V;
evalpoly(V, A2, p1);
} else {
s = std::max(int(std::ceil(std::log2(nA / 5.4))), 0);
double t = 1.0;
if (s > 0) {
t = 1.0 / std::exp2(s);
A2 *= (t * t);
if (s & 1) std::swap(U, V);
}
SquareMatrix<T, L> A4{A2 * A2}, A6{A2 * A4};
V << A6 * (A6 + 16380 * A4 + 40840800 * A2) +
(33522128640 * A6 + 10559470521600 * A4 + 1187353796428800 * A2) +
32382376266240000 * I;
U << A * V;
if (s > 0) U *= t;
V << A6 * (182 * A6 + 960960 * A4 + 1323241920 * A2) +
(670442572800 * A6 + 129060195264000 * A4 + 7771770303897600 * A2) +
64764752532480000 * I;
}
for (auto v = V.begin(), u = U.begin(), e = V.end(); v != e; ++v, ++u) {
auto &&d = *v - *u;
*v += *u;
*u = d;
}
LU::ldiv(U, MutPtrMatrix<T>(V));
for (; s--;) {
U << V * V;
std::swap(U, V);
}
}

Yielding:

julia> @benchmark ForEach(gccexpm!, $B, $As)()
BenchmarkTools.Trial: 8749 samples with 1 evaluation.
Range (min … max): 110.272 μs … 370.745 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 112.821 μs ┊ GC (median): 0.00%
Time (mean ± σ): 113.465 μs ± 5.759 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▄█▂ ▃▄ ▁ ▁
▇▁▁▁▁▄▁▅▅▄███▅▅█████▆▆▇▅▄▃▄▃▅▄▄▁▃▅▆▅▅▃▃▅▅▄▃▅▅▅▃▆▁▃▄▄▄▄▄▄▄▃▃▄█ █
110 μs Histogram: log(frequency) by time 124 μs <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark ForEach(clangexpm!, $B, $As)()
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 73.307 μs … 231.474 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 73.503 μs ┊ GC (median): 0.00%
Time (mean ± σ): 73.789 μs ± 3.143 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▄██▇▆▆▆▅▃▁ ▁ ▂▂▃▂▂▁ ▂
██████████▆▆██▆▁███████▇▇▇█▇█▇▆▇▆▆▅▄▁▄▄▃▅▄▄▄▃▁▃▁▄▁▄▁▄▅▁▃▃▄▃▃ █
73.3 μs Histogram: log(frequency) by time 77.3 μs <
Memory estimate: 0 bytes, allocs estimate: 0.

Want to switch the base branch to …?

The primary risk of regressions to be mindful of is that the inliner assigns …

Anyone else want to review this?

Is the compilation time regression fixed (that was the cause of the previous revert, IIRC)?
One benchmark, with the latest ForwardDiff:

58.703225 seconds (120.99 M allocations: 7.162 GiB, 5.61% gc time, 82.85% compilation time: <1% of which was recompilation)
Multithreaded
5.856279 seconds (28.28 M allocations: 1.502 GiB, 5.04% gc time, 34.00% compilation time)
3.966313 seconds (24.36 M allocations: 1.269 GiB, 3.71% gc time)
3.990597 seconds (24.36 M allocations: 1.269 GiB, 3.26% gc time)
3.912937 seconds (24.36 M allocations: 1.269 GiB, 2.99% gc time)
3.926629 seconds (24.36 M allocations: 1.269 GiB, 3.30% gc time)
Singlethreaded
10.105259 seconds (24.41 M allocations: 1.271 GiB, 2.44% gc time)
10.108382 seconds (24.41 M allocations: 1.271 GiB, 2.42% gc time)
10.092458 seconds (24.41 M allocations: 1.271 GiB, 2.38% gc time)
10.090259 seconds (24.41 M allocations: 1.271 GiB, 2.53% gc time)
10.088804 seconds (24.41 M allocations: 1.271 GiB, 2.44% gc time)
226.945 secs

vs this PR:

56.318331 seconds (132.22 M allocations: 7.687 GiB, 5.07% gc time, 85.20% compilation time: <1% of which was recompilation)
Multithreaded
5.237769 seconds (28.39 M allocations: 1.513 GiB, 4.91% gc time, 36.93% compilation time)
3.390317 seconds (24.46 M allocations: 1.279 GiB, 3.85% gc time)
3.373176 seconds (24.46 M allocations: 1.279 GiB, 3.36% gc time)
3.369030 seconds (24.46 M allocations: 1.279 GiB, 3.30% gc time)
3.361547 seconds (24.46 M allocations: 1.279 GiB, 3.24% gc time)
Singlethreaded
8.358523 seconds (24.46 M allocations: 1.278 GiB, 2.30% gc time)
8.345199 seconds (24.46 M allocations: 1.278 GiB, 2.15% gc time)
8.333443 seconds (24.46 M allocations: 1.278 GiB, 2.11% gc time)
8.328236 seconds (24.46 M allocations: 1.278 GiB, 2.05% gc time)
8.337621 seconds (24.46 M allocations: 1.278 GiB, 2.06% gc time)
215.426 secs

The first time reported above includes compilation time, while the final "secs" figure is the total script time reported from the command line. Compile time didn't seem to change much, but we got a decent improvement in runtime. Unfortunately, I did try another problem where I saw a hefty compile-time regression:

294.641244 seconds (1.24 G allocations: 310.848 GiB, 10.16% gc time, 7.09% compilation time)
1196.506264 seconds (361.53 M allocations: 54.645 GiB, 0.57% gc time, 67.88% compilation time)
Multithreaded
90.339731 seconds (1.28 G allocations: 327.525 GiB, 15.69% gc time, 2.19% compilation time)
98.263920 seconds (59.49 M allocations: 40.988 GiB, 1.20% gc time, 0.80% compilation time)
88.254495 seconds (1.27 G allocations: 327.282 GiB, 15.96% gc time)
96.993289 seconds (58.71 M allocations: 40.937 GiB, 1.33% gc time)
Singlethreaded
262.654300 seconds (1.20 G allocations: 308.277 GiB, 5.76% gc time)
383.569549 seconds (58.68 M allocations: 40.927 GiB, 0.36% gc time)
1475.777 secs

vs this PR:

335.140249 seconds (1.65 G allocations: 427.614 GiB, 7.81% gc time, 6.51% compilation time)
1759.027660 seconds (848.52 M allocations: 74.163 GiB, 0.73% gc time, 88.33% compilation time)
Multithreaded
77.094182 seconds (1.20 G allocations: 308.489 GiB, 17.78% gc time, 2.49% compilation time)
61.022739 seconds (70.76 M allocations: 49.174 GiB, 2.55% gc time, 1.29% compilation time)
72.532341 seconds (1.20 G allocations: 308.245 GiB, 18.73% gc time)
59.765939 seconds (69.98 M allocations: 49.122 GiB, 1.83% gc time)
Singlethreaded
322.542648 seconds (1.60 G allocations: 424.651 GiB, 5.87% gc time)
203.501279 seconds (61.39 M allocations: 43.153 GiB, 0.66% gc time)
2959.059 secs

The first two times here include compile times. Note that the benchmark above alternates between using … I'm still a fan of this PR, but it'd be great if someone from the compiler team could look into improving compile times of …
This needs thorough testing by downstream users to ensure regressions haven't crept in due to the implementation in SIMD.jl.
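As one example of the kind of check that could guard against such regressions, here is a hedged sketch comparing ForwardDiff gradients against central finite differences (the test function and tolerance are arbitrary choices, not from this thread):

using ForwardDiff, Test

f(x) = sum(exp, x) + prod(x)
x = randn(5)
g = ForwardDiff.gradient(f, x)
h = 1e-6
for i in eachindex(x)
    e = zeros(length(x)); e[i] = h
    # central finite difference for the i-th partial derivative
    @test isapprox(g[i], (f(x .+ e) - f(x .- e)) / (2h); rtol = 1e-5)
end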