Skip to content

Conversation

@matteosecli
Copy link

  • Expand LinearAlgebra.eigen() to handle non-symmetric matrices via recent Xgeev
  • Add LinearAlgebra.eigvals() and LinearAlgebra.eigvecs()

@github-actions
Copy link
Contributor

github-actions bot commented May 26, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Click here to view the suggested changes.
diff --git a/test/libraries/cusolver/dense.jl b/test/libraries/cusolver/dense.jl
index fa5dfddea..d3e7df645 100644
--- a/test/libraries/cusolver/dense.jl
+++ b/test/libraries/cusolver/dense.jl
@@ -334,31 +334,31 @@ sorteig!(λ::AbstractVector, sortby::Union{Function, Nothing} = eigsortby) = sor
         @testset "geev!" begin
             local d_W, d_V
 
-            A              = rand(elty,m,m)
-            d_A            = CuArray(A)
-            Eig            = eigen(A)
-            d_eig          = eigen(d_A)
+            A = rand(elty, m, m)
+            d_A = CuArray(A)
+            Eig = eigen(A)
+            d_eig = eigen(d_A)
             sorteig!(d_eig.values, d_eig.vectors)
             @test Eig.values ≈ collect(d_eig.values)
-            h_V            = collect(d_eig.vectors)
-            h_V⁻¹          = inv(h_V)
-            @test abs.(h_V⁻¹*Eig.vectors) ≈ I
+            h_V = collect(d_eig.vectors)
+            h_V⁻¹ = inv(h_V)
+            @test abs.(h_V⁻¹ * Eig.vectors) ≈ I
 
-            A              = rand(elty,m,m)
-            d_A            = CuArray(A)
-            W              = eigvals(A)
-            d_W            = eigvals(d_A)
+            A = rand(elty, m, m)
+            d_A = CuArray(A)
+            W = eigvals(A)
+            d_W = eigvals(d_A)
             sorteig!(d_W)
-            @test W        ≈ collect(d_W)
+            @test W ≈ collect(d_W)
 
-            A              = rand(elty,m,m)
-            d_A            = CuArray(A)
-            V              = eigvecs(A)
-            d_W            = eigvals(d_A)
-            d_V            = eigvecs(d_A)
+            A = rand(elty, m, m)
+            d_A = CuArray(A)
+            V = eigvecs(A)
+            d_W = eigvals(d_A)
+            d_V = eigvecs(d_A)
             sorteig!(d_W, d_V)
-            V⁻¹            = inv(V)
-            @test abs.(V⁻¹*collect(d_V)) ≈ I
+            V⁻¹ = inv(V)
+            @test abs.(V⁻¹ * collect(d_V)) ≈ I
         end
     end
 
@@ -417,38 +417,38 @@ sorteig!(λ::AbstractVector, sortby::Union{Function, Nothing} = eigsortby) = sor
             @test abs.(Eig.vectors'*h_V) ≈ I
         end
 
-        A              = rand(elty,m,m)
-        A             += A'
-        d_A            = CuArray(A)
-        W              = eigvals(LinearAlgebra.Hermitian(A))
-        d_W            = eigvals(d_A)
+        A = rand(elty, m, m)
+        A += A'
+        d_A = CuArray(A)
+        W = eigvals(LinearAlgebra.Hermitian(A))
+        d_W = eigvals(d_A)
         sorteig!(d_W)
-        @test W        ≈ collect(d_W)
-        d_W            = eigvals(LinearAlgebra.Hermitian(d_A))
-        @test W        ≈ collect(d_W)
+        @test W ≈ collect(d_W)
+        d_W = eigvals(LinearAlgebra.Hermitian(d_A))
+        @test W ≈ collect(d_W)
         if elty <: Real
-            W              = eigvals(LinearAlgebra.Symmetric(A))
-            d_W            = eigvals(LinearAlgebra.Symmetric(d_A))
-            @test W        ≈ collect(d_W)
+            W = eigvals(LinearAlgebra.Symmetric(A))
+            d_W = eigvals(LinearAlgebra.Symmetric(d_A))
+            @test W ≈ collect(d_W)
         end
 
-        A              = rand(elty,m,m)
-        A             += A'
-        d_A            = CuArray(A)
-        V              = eigvecs(LinearAlgebra.Hermitian(A))
-        d_W            = eigvals(d_A)
-        d_V            = eigvecs(d_A)
+        A = rand(elty, m, m)
+        A += A'
+        d_A = CuArray(A)
+        V = eigvecs(LinearAlgebra.Hermitian(A))
+        d_W = eigvals(d_A)
+        d_V = eigvecs(d_A)
         sorteig!(d_W, d_V)
-        h_V            = collect(d_V)
-        @test abs.(V'*h_V) ≈ I
-        d_V            = eigvecs(LinearAlgebra.Hermitian(d_A))
-        h_V            = collect(d_V)
-        @test abs.(V'*h_V) ≈ I
+        h_V = collect(d_V)
+        @test abs.(V' * h_V) ≈ I
+        d_V = eigvecs(LinearAlgebra.Hermitian(d_A))
+        h_V = collect(d_V)
+        @test abs.(V' * h_V) ≈ I
         if elty <: Real
-            V              = eigvecs(LinearAlgebra.Symmetric(A))
-            d_V            = eigvecs(LinearAlgebra.Symmetric(d_A))
-            h_V            = collect(d_V)
-            @test abs.(V'*h_V) ≈ I
+            V = eigvecs(LinearAlgebra.Symmetric(A))
+            d_V = eigvecs(LinearAlgebra.Symmetric(d_A))
+            h_V = collect(d_V)
+            @test abs.(V' * h_V) ≈ I
         end
     end
 

@maleadt
Copy link
Member

maleadt commented May 27, 2025

Thanks. Can you add some tests?

@maleadt maleadt added enhancement New feature or request needs tests Tests are requested. cuda libraries Stuff about CUDA library wrappers. labels May 27, 2025
@maleadt maleadt requested a review from kshyatt May 27, 2025 11:21
Copy link
Contributor

@github-actions github-actions bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA.jl Benchmarks

Benchmark suite Current: b06923e Previous: bb7bc44 Ratio
latency/precompile 56677483179 ns 57053937298.5 ns 0.99
latency/ttfp 8177801447 ns 8313449149.5 ns 0.98
latency/import 4498929499 ns 4495227234 ns 1.00
integration/volumerhs 9610323 ns 9609731 ns 1.00
integration/byval/slices=1 147105 ns 146771 ns 1.00
integration/byval/slices=3 426475 ns 425865 ns 1.00
integration/byval/reference 145257 ns 145205 ns 1.00
integration/byval/slices=2 286671 ns 286334 ns 1.00
integration/cudadevrt 103721 ns 103543 ns 1.00
kernel/indexing 14490 ns 14174 ns 1.02
kernel/indexing_checked 15093 ns 14879 ns 1.01
kernel/occupancy 668.3459119496855 ns 670.9177215189874 ns 1.00
kernel/launch 2303.166666666667 ns 2111.8 ns 1.09
kernel/rand 15431 ns 15193 ns 1.02
array/reverse/1d 19941.5 ns 20290 ns 0.98
array/reverse/2dL_inplace 67112 ns 66861 ns 1.00
array/reverse/1dL 70198.5 ns 70337 ns 1.00
array/reverse/2d 21897.5 ns 22210 ns 0.99
array/reverse/1d_inplace 9748 ns 9641 ns 1.01
array/reverse/2d_inplace 13474 ns 13386 ns 1.01
array/reverse/2dL 74161 ns 74283 ns 1.00
array/reverse/1dL_inplace 66901 ns 66783 ns 1.00
array/copy 20796 ns 20919 ns 0.99
array/iteration/findall/int 158798 ns 158654 ns 1.00
array/iteration/findall/bool 140599 ns 140615 ns 1.00
array/iteration/findfirst/int 161824 ns 161604.5 ns 1.00
array/iteration/findfirst/bool 162598 ns 162505 ns 1.00
array/iteration/scalar 75084 ns 74607 ns 1.01
array/iteration/logical 218108.5 ns 218146 ns 1.00
array/iteration/findmin/1d 51094 ns 53029 ns 0.96
array/iteration/findmin/2d 97115 ns 96965 ns 1.00
array/reductions/reduce/Int64/1d 43784 ns 43857 ns 1.00
array/reductions/reduce/Int64/dims=1 44830.5 ns 44929 ns 1.00
array/reductions/reduce/Int64/dims=2 61551 ns 61783 ns 1.00
array/reductions/reduce/Int64/dims=1L 89033 ns 89162 ns 1.00
array/reductions/reduce/Int64/dims=2L 88110 ns 88307 ns 1.00
array/reductions/reduce/Float32/1d 38533 ns 37230 ns 1.03
array/reductions/reduce/Float32/dims=1 42108 ns 50726 ns 0.83
array/reductions/reduce/Float32/dims=2 60059 ns 60284 ns 1.00
array/reductions/reduce/Float32/dims=1L 52436 ns 52649 ns 1.00
array/reductions/reduce/Float32/dims=2L 72413 ns 72742 ns 1.00
array/reductions/mapreduce/Int64/1d 43803 ns 43667 ns 1.00
array/reductions/mapreduce/Int64/dims=1 44661.5 ns 44472 ns 1.00
array/reductions/mapreduce/Int64/dims=2 61855 ns 61760 ns 1.00
array/reductions/mapreduce/Int64/dims=1L 89060 ns 89084 ns 1.00
array/reductions/mapreduce/Int64/dims=2L 88372.5 ns 88448.5 ns 1.00
array/reductions/mapreduce/Float32/1d 38249.5 ns 37214 ns 1.03
array/reductions/mapreduce/Float32/dims=1 42299 ns 43130 ns 0.98
array/reductions/mapreduce/Float32/dims=2 60161 ns 60025 ns 1.00
array/reductions/mapreduce/Float32/dims=1L 52739 ns 52879 ns 1.00
array/reductions/mapreduce/Float32/dims=2L 72367 ns 72506 ns 1.00
array/broadcast 20539 ns 20173 ns 1.02
array/copyto!/gpu_to_gpu 13077 ns 11263 ns 1.16
array/copyto!/cpu_to_gpu 217786 ns 216321 ns 1.01
array/copyto!/gpu_to_cpu 285279.5 ns 283906 ns 1.00
array/accumulate/Int64/1d 125337 ns 125185 ns 1.00
array/accumulate/Int64/dims=1 83731.5 ns 83290 ns 1.01
array/accumulate/Int64/dims=2 157967 ns 157793 ns 1.00
array/accumulate/Int64/dims=1L 1710227 ns 1709512 ns 1.00
array/accumulate/Int64/dims=2L 966796 ns 966574.5 ns 1.00
array/accumulate/Float32/1d 109130 ns 109309 ns 1.00
array/accumulate/Float32/dims=1 80559 ns 80361 ns 1.00
array/accumulate/Float32/dims=2 147918.5 ns 147589 ns 1.00
array/accumulate/Float32/dims=1L 1618928 ns 1618990 ns 1.00
array/accumulate/Float32/dims=2L 698837.5 ns 698423 ns 1.00
array/construct 1300.3 ns 1284 ns 1.01
array/random/randn/Float32 47430 ns 48672 ns 0.97
array/random/randn!/Float32 25085 ns 25210 ns 1.00
array/random/rand!/Int64 27522 ns 27299 ns 1.01
array/random/rand!/Float32 8885.333333333334 ns 8794.333333333334 ns 1.01
array/random/rand/Int64 29911 ns 30095 ns 0.99
array/random/rand/Float32 13053 ns 13270 ns 0.98
array/permutedims/4d 55464 ns 55523.5 ns 1.00
array/permutedims/2d 54269 ns 54033.5 ns 1.00
array/permutedims/3d 55259 ns 55090.5 ns 1.00
array/sorting/1d 2758835 ns 2758035 ns 1.00
array/sorting/by 3345979 ns 3344921 ns 1.00
array/sorting/2d 1082198.5 ns 1081700.5 ns 1.00
cuda/synchronization/stream/auto 1036 ns 1021.1 ns 1.01
cuda/synchronization/stream/nonblocking 7987.700000000001 ns 7536.4 ns 1.06
cuda/synchronization/stream/blocking 799.040404040404 ns 790.5204081632653 ns 1.01
cuda/synchronization/context/auto 1191.6 ns 1172.4 ns 1.02
cuda/synchronization/context/nonblocking 8919.2 ns 8350.5 ns 1.07
cuda/synchronization/context/blocking 910.0930232558139 ns 899.8913043478261 ns 1.01

This comment was automatically generated by workflow using github-action-benchmark.

@matteosecli
Copy link
Author

matteosecli commented May 27, 2025

Thanks. Can you add some tests?

I've added some, though most fail due to the fact that LinearAlgebra.eig*() for nonsymmetric standard arrays return sorted eigenpairs/values (see LinearAlgebra.jl/src/eigen.jl#L128-L140), while the existing and new overloads for CUDA arrays do not.

Should we return ordered eigenpairs, too? Or should we just order them in the tests alone? Reordering could impact performance.

@matteosecli
Copy link
Author

matteosecli commented May 27, 2025

I've simplified a bit the eigvecs functions and fixed the symmetric eigvals functions. Now all the tests for symmetric routines pass.

Unfortunately, it seems that the output of Xgeev needs to be reordered to match LAPACK's output. I've added a quick sorteig! function adapted from LinearAlgebra in order for the tests to pass, but I would avoid reordering tbh and just use that function for testing purposes.

As for geev testing, stuff like LAPACK.geev!('N','V', CuArray(A)) (which is instead tested for e.g. syevd) won't work as there is no explicit overload for that method in the code (perhaps due to missing typed routines?). I've commented out that testing section for now, but tbh I'd just remove it if that's ok for you.

Finally, I still cannot get the two @test abs.(h_V⁻¹*Eig.vectors) ≈ I tests to work for geev, any idea? From a quick look it seems like Eig.vectors*h_V⁻¹ is almost an identity matrix, although with a not-so-great precision.

@matteosecli
Copy link
Author

It turns out I accidentally overlooked the final basis transformation necessary for real geev routines to return proper eigenvectors; it should be fixed now.

The only points left are whether to do eigenvalues/vectors sorting or not, and what to do with the commented out geev test.

@maleadt
Copy link
Member

maleadt commented May 28, 2025

Lacking familiarity, some superficial thoughts.

whether to do eigenvalues/vectors sorting or not

Given that sorting is relatively expensive, and IIUC does not practically matter if only for convenience/reproducibility, I'd only do this in the tests.

what to do with the commented out geev test

What functionality is missing? We typically do not "overload" LAPACK methods like LAPACK.geev!. Why doesn't CUSOLVER.Xgeev! work?

@matteosecli
Copy link
Author

Given that sorting is relatively expensive, and IIUC does not practically matter if only for convenience/reproducibility, I'd only do this in the tests.

Great, I'll do that.

What functionality is missing? We typically do not "overload" LAPACK methods like LAPACK.geev!. Why doesn't CUSOLVER.Xgeev! work?

The geev equivalent of these definitions:

CUDA.jl/lib/cusolver/dense.jl

Lines 1053 to 1063 in 8b6a2a0

for elty in (:Float32, :Float64)
@eval begin
LAPACK.syevd!(jobz::Char, uplo::Char, A::StridedCuMatrix{$elty}) = syevd!(jobz, uplo, A)
end
end
for elty in (:ComplexF32, :ComplexF64)
@eval begin
LAPACK.syevd!(jobz::Char, uplo::Char, A::StridedCuMatrix{$elty}) = heevd!(jobz, uplo, A)
end
end

which in turn use these definitions:
for (jname, bname, fname, elty, relty) in ((:syevd!, :cusolverDnSsyevd_bufferSize, :cusolverDnSsyevd, :Float32, :Float32),
(:syevd!, :cusolverDnDsyevd_bufferSize, :cusolverDnDsyevd, :Float64, :Float64),
(:heevd!, :cusolverDnCheevd_bufferSize, :cusolverDnCheevd, :ComplexF32, :Float32),
(:heevd!, :cusolverDnZheevd_bufferSize, :cusolverDnZheevd, :ComplexF64, :Float64))
@eval begin
function $jname(jobz::Char,
uplo::Char,
A::StridedCuMatrix{$elty})
chkuplo(uplo)
n = checksquare(A)
lda = max(1, stride(A, 2))
W = CuArray{$relty}(undef, n)
dh = dense_handle()
function bufferSize()
out = Ref{Cint}(0)
$bname(dh, jobz, uplo, n, A, lda, W, out)
return out[] * sizeof($elty)
end
with_workspace(dh.workspace_gpu, bufferSize) do buffer
$fname(dh, jobz, uplo, n, A, lda, W,
buffer, sizeof(buffer) ÷ sizeof($elty), dh.info)
end
info = @allowscalar dh.info[1]
chkargsok(BlasInt(info))
if jobz == 'N'
return W
elseif jobz == 'V'
return W, A
end
end
end
end

is missing.

The definitions above use a "legacy" API, with type-dependent function names (e.g. *Ssyevd, *Dsyevd, etc), while CUDA offers also a "generic" API (see docs) in which types are passed as parameters and there is just one function name (e.g. *Xsyevd).

The generic API functions are translated in dense_generic.jl, e.g.:

# Xsyevd
function Xsyevd!(jobz::Char, uplo::Char, A::StridedCuMatrix{T}) where {T <: BlasFloat}
chkuplo(uplo)
n = checksquare(A)
R = real(T)
lda = max(1, stride(A, 2))
W = CuVector{R}(undef, n)
params = CuSolverParameters()
dh = dense_handle()
function bufferSize()
out_cpu = Ref{Csize_t}(0)
out_gpu = Ref{Csize_t}(0)
cusolverDnXsyevd_bufferSize(dh, params, jobz, uplo, n,
T, A, lda, R, W, T, out_gpu, out_cpu)
out_gpu[], out_cpu[]
end
with_workspaces(dh.workspace_gpu, dh.workspace_cpu, bufferSize()...) do buffer_gpu, buffer_cpu
cusolverDnXsyevd(dh, params, jobz, uplo, n, T, A,
lda, R, W, T, buffer_gpu, sizeof(buffer_gpu),
buffer_cpu, sizeof(buffer_cpu), dh.info)
end
flag = @allowscalar dh.info[1]
chkargsok(flag |> BlasInt)
if jobz == 'N'
return W
elseif jobz == 'V'
return W, A
end
end

So, while the symmetric diagonalization routines have both the "legacy" and "generic" API, the non-symmetric diagonalization routine has just the "generic" API Xgeev.

This results in a somewhat unbalanced situation in the existing codebase. Calling e.g. LAPACK.syev!(..., CuArray(A)) for a symmetric matrix actually forwards to CUSOLVER, while calling LAPACK.geev!(..., CuArray(A)) for a generic matrix does not, and produces an error.

@maleadt
Copy link
Member

maleadt commented May 28, 2025

The geev equivalent of these definitions:

CUDA.jl/lib/cusolver/dense.jl

Lines 1053 to 1063 in 8b6a2a0

for elty in (:Float32, :Float64)
@eval begin
LAPACK.syevd!(jobz::Char, uplo::Char, A::StridedCuMatrix{$elty}) = syevd!(jobz, uplo, A)
end
end
for elty in (:ComplexF32, :ComplexF64)
@eval begin
LAPACK.syevd!(jobz::Char, uplo::Char, A::StridedCuMatrix{$elty}) = heevd!(jobz, uplo, A)
end
end

It's unfortunate that we (still) have these (again). The Base LAPACK interface isn't really meant to be extended; IME it's better to implement interfaces at a higher level (e.g. LinearAlgebra) and dispatch to methods specific to CUDA libraries. There are also often incompatibilities between the CPU and GPU LAPACK interfaces that prevent this kind of code reuse.
That said, things are working, so it's not really a priority to change all that.

@matteosecli
Copy link
Author

It's unfortunate that we (still) have these (again). The Base LAPACK interface isn't really meant to be extended; IME it's better to implement interfaces at a higher level (e.g. LinearAlgebra) and dispatch to methods specific to CUDA libraries. There are also often incompatibilities between the CPU and GPU LAPACK interfaces that prevent this kind of code reuse. That said, things are working, so it's not really a priority to change all that.

Sounds great, I'll then remove the commented out tests.

As a side note, I've tried to adhere to the whitespace style of current tests, but runic is not happy; do you really want me to reformat as runic says?

@maleadt
Copy link
Member

maleadt commented May 28, 2025

As a side note, I've tried to adhere to the whitespace style of current tests, but runic is not happy; do you really want me to reformat as runic says?

No need.

@matteosecli
Copy link
Author

Hello, are there any news about this?

@maleadt maleadt force-pushed the feature-nonsymmetric-eigen branch from 3d0f36d to 56be9b3 Compare June 18, 2025 06:22
@codecov
Copy link

codecov bot commented Jun 18, 2025

Codecov Report

All modified and coverable lines are covered by tests ✅

Project coverage is 89.57%. Comparing base (9db8376) to head (56be9b3).
Report is 1 commit behind head on master.

Additional details and impacted files
@@            Coverage Diff             @@
##           master    #2787      +/-   ##
==========================================
+ Coverage   89.50%   89.57%   +0.06%     
==========================================
  Files         153      153              
  Lines       13298    13340      +42     
==========================================
+ Hits        11903    11949      +46     
+ Misses       1395     1391       -4     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@maleadt
Copy link
Member

maleadt commented Jun 18, 2025

Sorry, I was waiting for CI to recover.

Why the added CUSOLVER.version() >= v"11.7.1" conditionals?

@matteosecli
Copy link
Author

Why the added CUSOLVER.version() >= v"11.7.1" conditionals?

Xgeev was introduced in CUDA 12.6.2, which should correspond to CUSOLVER 11.7.1: https://docs.nvidia.com/cuda/archive/12.6.2/cuda-toolkit-release-notes/index.html

@maleadt
Copy link
Member

maleadt commented Jun 18, 2025

Right, but eigen previously worked (and was tested) on CUDA 11.4 - 12.8, while now it requires CUDA 12.6. The original functionality probably should be retained, ideally in a way that makes easy to excise once we bump the minimal CUDA version.

@matteosecli
Copy link
Author

matteosecli commented Nov 13, 2025

Hi @maleadt, sorry, didn't have much time to work on this.

I've removed the extra checks, leaving only the one that explicitly tests Xgeev (as we need CUSOLVER 11.7.1 for that).

Edit: I haven't been able to check the tests as they fail with Julia 1.12 on an unrelated testcase.

@matteosecli
Copy link
Author

It seems CI is properly running again and tests are passing on older CUDA versions as well.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

cuda libraries Stuff about CUDA library wrappers. enhancement New feature or request needs tests Tests are requested.

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants