Last active
August 2, 2022 03:18
-
-
Save bgeneto/155dbaf4e56fae9015b40c16c4268b7a to your computer and use it in GitHub Desktop.
OpenBLAS ZHEEV slow performance bug
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
! -------------------------------------------------------------------------------------- | |
! | |
! 01. Purpose: Solves a simple eigenvalue/eigenvector problem in order to check | |
! for the zheev thread locking bug #1560 | |
! https://github.com/xianyi/OpenBLAS/issues/1560 | |
! | |
! 02. Compile and build | |
! | |
! export MAX_THREADS=4 | |
! export OPENBLAS_NUM_THREADS=$MAX_THREADS | |
! export MKL_NUM_THREADS=$MAX_THREADS | |
! export OMP_NUM_THREADS=$MAX_THREADS | |
! export GOTO_NUM_THREADS=$MAX_THREADS | |
! export BLIS_NUM_THREADS=$MAX_THREADS | |
! | |
! gfortran + openblas: | |
! gfortran -O2 -m64 -mcmodel=medium -fmax-stack-var-size=2147483647 -pthread ./hermitianEigen.f90 -o ./hermitianEigen -lopenblas | |
! | |
! gfortran + mkl: | |
! gfortran -O2 -m64 -mcmodel=medium -fmax-stack-var-size=2147483647 ./hermitianEigen.f90 -o ./hermitianEigen-mkl -Wl,--no-as-needed -lmkl_gf_lp64 -lmkl_tbb_thread -lmkl_core -lpthread -lm -ldl | |
! | |
! Intel fortran + MKL: | |
! ifort -O2 -i8 -qmkl=parallel ./hermitianEigen.f90 -o ./hermitianEigen-ifort -lpthread -lm -ldl | |
! | |
! 03. Run | |
! # to generate a positive definite hermitian matrix: | |
! ./hermitianEigen -p | |
! # to generate a generic hermitian matrix: | |
! ./hermitianEigen | |
! | |
! Last Modified: 20220801 | |
! Author: bgeneto | |
! | |
! -------------------------------------------------------------------------------------- | |
program hermitianEigen | |
use, intrinsic :: iso_fortran_env | |
implicit none | |
integer, parameter :: rp = REAL64 ! default real precision | |
integer, parameter :: ip = INT64 ! default integer precision | |
integer(ip), parameter :: n = 4096_ip ! hermitian matrix dimension | |
integer(ip), parameter :: lda = n, ldz = n ! leading dimensions | |
integer :: seed(4) = [0, 0, 0, 1] | |
integer(ip) :: i, j, k, info, lwork, lrwork, liwork, & | |
& il, iu, m, start_time, end_time, rate | |
real(rp) :: abstol, vl, vu | |
! static/stack allocated arrays | |
real(rp) :: w(n), revals(n*n), imvals(n*n) | |
integer(ip) :: isuppz(n) | |
complex(rp) :: mat(lda,n), bmat(lda,n), z(ldz,n) | |
! dynamic allocated arrays | |
real(rp), allocatable, dimension(:) :: rwork | |
integer(ip), allocatable, dimension(:) :: iwork | |
complex(rp), allocatable, dimension(:) :: work | |
character(100) :: param | |
logical :: pdm = .false. | |
intrinsic :: random_seed, random_number | |
intrinsic :: int, min, max | |
external :: zheev, zheevr | |
call system_clock (count_rate=rate) | |
call system_clock (count=start_time) | |
! uses the default tolerance | |
abstol = -1.0_rp | |
! check command line parameter (in order to generate a positive definite matrix) | |
cmdarg: if (command_argument_count() > 0) then | |
call get_command_argument(1, param) | |
param = trim(adjustl(param)) | |
if (param(1:2) .eq. '-p' .or. param(1:2) .eq. '/p') then | |
pdm = .true. | |
end if | |
end if cmdarg | |
! generate matrix | |
matrix: if (pdm) then | |
! generate random hermitian positive definite matrix | |
! in this case zheev runs faster than zheevr... | |
write(*,*) '01. Generating random numbers for hermitian matrix... ' | |
call zlarnv(1, seed, lda*n, mat) | |
write(*,*) '02. Building our positive definite hermitian matrix...' | |
mat = mat * conjg(transpose(mat)) | |
do i = 1, n | |
mat(i,i) = mat(i,i) + n | |
end do | |
else | |
write(*,*) '01. Generating random numbers for hermitian matrix... ' | |
call random_seed() | |
call random_number(revals) | |
call random_seed() | |
call random_number(imvals) | |
write(*,*) '02. Building our generic hermitian matrix...' | |
! dummy way to create our hermitian matrix | |
k = 0_ip | |
do j = 1, n | |
do i = 1, n | |
k = k + 1_ip | |
mat(i,j) = cmplx(revals(k), imvals(k), kind=rp) | |
if (i == j) then | |
mat(i,j)%im = 0.0_rp | |
else if ( (mat(i,j) /= mat(j,i)) .and. (j > i) ) then | |
mat(i,j)%re = mat(j,i)%re | |
mat(i,j)%im = -mat(j,i)%im | |
end if | |
end do | |
end do | |
end if matrix | |
! boundary conditions | |
mat(1,n) = mat(n,1) | |
! backup matrix | |
bmat = mat | |
! ------------------ zheevr ------------------ | |
write(*,*) '03. Computing using ZHEEVR...' | |
! temporary allocate arrays | |
allocate(iwork(1), rwork(1), work(1)) | |
! query optimal workspace array dimensions | |
lwork = -1 | |
lrwork = -1 | |
liwork = -1 | |
call zheevr('V', 'A', 'U', n, mat, lda, vl, vu, il, & | |
& iu, abstol, m, w, z, ldz, isuppz, work, lwork, rwork, & | |
& lrwork, iwork, liwork, info) | |
if ( info .gt. 0 ) then | |
write(*,*) 'Failed to find optimal work array dimensions.' | |
stop | |
end if | |
! work arrays dimensions | |
lwork = max(2*n, int(work(1))) | |
lrwork = max(24*n, int(rwork(1))) | |
liwork = min(10*n, iwork(1)) | |
write(*,"(1x,a,3i9)") '04. Optimal workspace for ZHEEVR computed: ', liwork, lrwork, lwork | |
deallocate(iwork, rwork, work) | |
allocate(iwork(liwork), rwork(lrwork), work(lwork)) | |
write(*,*) '05. Computing all eigenvalues and eigenvectors using ZHEEVR...' | |
! now we solve the eigenproblem | |
call zheevr( 'V', 'A', 'U', n, mat, lda, vl, vu, il, & | |
& iu, abstol, m, w, z, ldz, isuppz, work, lwork, rwork, & | |
& lrwork, iwork, liwork, info ) | |
! check for convergence | |
if ( info .gt. 0 ) then | |
write(*,*) 'ERROR: Algorithm ZHEEVR failed to compute eigenvalues.' | |
stop | |
else | |
write(*,*) ' OK: all eigenvalues/vectors computed successfully by ZHEEVR!' | |
end if | |
call system_clock (count=end_time) | |
write(*,"(a,f8.3,a)") "ZHEEVR took: ", real(end_time - start_time)/real(rate), " seconds" | |
! ------------------ zheev ------------------ | |
call system_clock (count_rate=rate) | |
call system_clock (count=start_time) | |
write(*,*) '06. Now using ZHEEV, so be patient (switch to single thread bug)...' | |
! temporary allocate arrays | |
deallocate(iwork, rwork, work) | |
allocate(iwork(1), rwork(1), work(1)) | |
! query the optimal workspace | |
lwork = -1 | |
call zheev( 'V', 'U', n, bmat, lda, w, work, lwork, rwork, info ) | |
if ( info .gt. 0 ) then | |
write(*,*) 'ERROR: Failed to find optimal work array dimension.' | |
stop | |
end if | |
! work arrays dimension | |
lwork = max(2*n-1, int(work(1))) | |
lrwork = max(1, 3*n-2) | |
write(*,"(1x,a,3i9)") '07. Optimal workspace for ZHEEV computed: ', lrwork, lwork | |
! proper array allocation | |
deallocate(rwork, work) | |
allocate(rwork(lrwork), work(lwork)) | |
write(*,*) '08. Computing all eigenvalues and eigenvectors using ZHEEV...' | |
! solve eigenproblem | |
call zheev( 'V', 'U', n, bmat, lda, w, work, lwork, rwork, info ) | |
! check for convergence | |
if ( info .gt. 0 ) then | |
write(*,*) 'ERROR: Algorithm ZHEEV failed to compute eigenvalues.' | |
stop | |
else | |
write(*,*) ' OK: all eigenvalues/vectors computed successfully by ZHEEV!' | |
end if | |
call system_clock (count=end_time) | |
write(*,"(a,f8.3,a)") "ZHEEV took: ", real(end_time - start_time)/real(rate), " seconds" | |
end program hermitianEigen |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment