cd /tmp
libver=0.3.21
wget https://github.com/xianyi/OpenBLAS/releases/download/v$libver/OpenBLAS-$libver.tar.gz
tar xf OpenBLAS-$libver.tar.gz
cd OpenBLAS-$libver
export USE_THREAD=1
export DYNAMIC_ARCH=0
export NO_WARMUP=1
export BUILD_RELAPACK=0
export BINARY=64
export INTERFACE64=1
#export SYMBOLSUFFIX=64_
#export USE_OPENMP=1
#export LIBNAMESUFFIX=omp
#export COMMON_OPT="-O3 -march=native"
#export CFLAGS="-O3 -march=native"
#export FCOMMON_OPT="-O3 -march=native"
#export FCFLAGS="-O3 -march=native"
ulimit -s unlimited
time make -j
AMD R9 3900x: user=605.74s system=110.45s cpu=1430% total=50.053
Intel I5 12600k: user=671.91s system=80.67s cpu=1184% total=55.055
AMD R5 5600G: user=803.87s system=85.05s cpu=959% total=1:21.65
#sudo apt install python-is-python3
time make -j lapack-test
Intel I5 12600k: user=310.05s system=703.66s cpu=794% total=1:34.60
AMD R5 5600G: user=183.18s system=243.94s cpu=406% total=1:44.99
AMD R9 3900x: user=266.44s system=557.51s cpu=678% total=2:01.52
sudo make install
And add the following env variables to your ~/.bashrc
OR ~/.zshrc
file:
tee -a $HOME/.bashrc $HOME/.zshrc > /dev/null <<EOT
# OpenBLAS
export C_INCLUDE_PATH=\$C_INCLUDE_PATH:/opt/OpenBLAS/include
export CPATH=\$CPATH:/opt/OpenBLAS/include
export LIBRARY_PATH=\$LIBRARY_PATH:/opt/OpenBLAS/lib
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/OpenBLAS/lib
EOT
Now reload your shell:
exec $SHELL
Now we are ready to test/benchmark our program. Source code follows:
! --------------------------------------------------------------------------------------
! Purpose: Solves a simple eigenvalue/eigenvector problem in order to benchmark the
! compiler, the library and/or the CPU.
!
! How to compile and build:
!
! gfortran + openblas:
! gfortran -pthread -fmax-stack-var-size=419430400 -O2 ./hermitianEigen.f90 -o ./hermitianEigen /opt/OpenBLAS/lib/libopenblas.a
!
! Intel fortran + MKL:
! ifort -O2 -qmkl ./hermitianEigen.f90 -o ./hermitianEigen-ifort
!
! Last Modified: 20220725
! --------------------------------------------------------------------------------------
program hermitianEigen
use, intrinsic :: iso_fortran_env
implicit none
integer, parameter :: rp = REAL64 ! default real precision
integer, parameter :: ip = INT64 ! default integer precision
integer(ip), parameter :: n = 4096_ip ! hermitian matrix dimension
integer(ip), parameter :: lda = n, ldz = n
integer(ip) :: i, j, k, info, lwmax, lwork, lrwork, liwork, &
& il, iu, m, nb, start_time, end_time, rate
real(rp) :: abstol, vl, vu
! static/stack allocated arrays
real(rp) :: w(n), revals(n*n), imvals(n*n), diag(n)
integer(ip) :: isuppz(n)
complex(rp) :: mat(lda,n), z(ldz, n)
! dynamic allocated arrays
real(rp), allocatable, dimension(:) :: rwork
integer(ip), allocatable, dimension(:) :: iwork
complex(rp), allocatable, dimension(:) :: work
intrinsic :: random_seed, random_number
intrinsic :: int, min, max
!external :: openblas_set_num_threads
external :: zheevr
!call openblas_set_num_threads(8)
call system_clock (count_rate=rate)
call system_clock (count=start_time)
abstol = -1.0_rp
write(*,*) '01. Generating random numbers for hermitian matrix... '
call random_seed()
call random_number(revals)
call random_seed()
call random_number(imvals)
write(*,*) '02. Building our hermitian matrix...'
! create hermitian matrix
k = 0_ip
do j = 1, n
do i = 1, n
k = k + 1_ip
mat(i,j) = cmplx(revals(k), imvals(k))
if (i == j) then
mat(i,j)%im = 0.0_rp
diag(j) = mat(i,j)
else if ( (mat(i,j) /= mat(j,i)) .and. (j > i) ) then
mat(i,j)%re = mat(j,i)%re
mat(i,j)%im = -mat(j,i)%im
end if
end do
end do
! boundary conditions
mat(1,n) = (0_rp, 1_rp)
mat(n,1) = (0_rp, -1_rp)
write(*,*) '03. Query the optimal workspace...'
! allocate arrays
allocate(iwork(1), rwork(1), work(1))
! query the optimal workspace
lwork = -1
lrwork = -1
liwork = -1
call zheevr('N', 'All', 'Lower', n, mat, lda, vl, vu, il, &
& iu, abstol, m, w, z, ldz, isuppz, work, lwork, rwork, &
& lrwork, iwork, liwork, info)
if ( info .gt. 0 ) then
write(*,*) 'Failed to find optimal work array dimension.'
stop
end if
! work arrays dimension
lwork = max(1, int(work(1)))
lrwork = max(1, int(rwork(1)))
liwork = max(1, iwork(1))
write(*,"(1x,a,3i7)") '04. Allocating memory for dynamic arrays of sizes ', lwork, lrwork, liwork
deallocate(iwork, rwork, work)
allocate(iwork(liwork), rwork(lrwork), work(lwork))
write(*,*) '05. Computing all eigenvalues and vectors...'
! solve eigenproblem
call zheevr( 'Vectors', 'All', 'Lower', n, mat, lda, vl, vu, il, &
& iu, abstol, m, w, z, ldz, isuppz, work, lwork, rwork, &
& lrwork, iwork, liwork, info )
! check for convergence
if ( info .gt. 0 ) then
write(*,*) 'The algorithm failed to compute eigenvalues.'
stop
else
write(*,*) ' OK: all eigenvalues and vectors computed successfully!'
end if
write(*,*) '06. Computing only the first 10 eigenvalues...'
! reconstruct diagonal elements
do i = 1, n
mat(i,i) = diag(i)
end do
! eigenvals interval
il = 1
iu = min(10, n-1)
! solve eigenproblem again
call zheevr( 'N', 'Interval', 'Upper', n, mat, lda, vl, vu, il, &
& iu, abstol, m, w, z, ldz, isuppz, work, lwork, rwork, &
& lrwork, iwork, liwork, info )
! check for convergence
if ( info .gt. 0 ) then
write(*,*) 'The algorithm failed to compute eigenvalues.'
stop
else
write(*,*) ' OK: first eigenvalues computed successfully!'
write(*,*) ' The total number of eigenvalues found: ', m
end if
call system_clock (count=end_time)
write(*,"(/a,f8.3,a)") "Took: ", real(end_time - start_time)/real(rate), " seconds"
end program hermitianEigen
Build and run eigensolver (gfortran+openblas):
gfortran -pthread -fmax-stack-var-size=419430400 -O2 ./hermitianEigen.f90 -o ./hermitianEigen /opt/OpenBLAS/lib/libopenblas.a
time ./hermitianEigen
Build and run (ifort+mkl):
ifort -O2 -qmkl ./hermitianEigen.f90 -o ./hermitianEigen-ifort
time ./hermitianEigen-ifort
If you like to execute the benchmark program using fewer threads than available:
export MAX_THREADS=4
export OPENBLAS_NUM_THREADS=$MAX_THREADS
export MKL_NUM_THREADS=$MAX_THREADS
export OMP_NUM_THREADS=$MAX_THREADS
export GOTO_NUM_THREADS=$MAX_THREADS
export BLIS_NUM_THREADS=$MAX_THREADS
export VECLIB_MAXIMUM_THREADS=$MAX_THREADS
export NUMEXPR_NUM_THREADS=$MAX_THREADS
export NUMBA_NUM_THREADS=$MAX_THREADS
Intel I5 12600K: Took: 16.810 seconds
(gfortran) | Took: 17.295 seconds
(ifort)
AMD R5 5600G: Took: 22.643 seconds
(gfortran) | Took: 30.434 seconds
(ifort)
AMD R9 3900x: Took: 20.071 seconds
(gfortran) | Took: 27.343 seconds
(ifort)