Commit 4ca77db1 authored by Victor Yu's avatar Victor Yu

Stop ifort from creating temp array in ELPA2 kernel

parent 9ed1e6ab
# ELSI changelog # ELSI changelog
## v2.6.1 (June 2020) ## Not released
### PEXSI ### ELPA
* Removed an improper abort from the error handling code of PEXSI. * Fix a performance regression of the ELPA2 generic kernel.
### Known issues ### Known issues
* The ELPA code can not be compiled with the NAG Fortran compiler, due to the * The ELPA code can not be compiled with the NAG Fortran compiler, due to the
...@@ -13,6 +13,11 @@ ...@@ -13,6 +13,11 @@
* Depending on the choice of k-points, the complex PEXSI solver may randomly * Depending on the choice of k-points, the complex PEXSI solver may randomly
fail at the inertia counting stage. fail at the inertia counting stage.
## v2.6.1 (June 2020)
### PEXSI
* Removed an improper abort from the error handling code of PEXSI.
## v2.6.0 (June 2020) ## v2.6.0 (June 2020)
### ELSI interface ### ELSI interface
......
...@@ -7,7 +7,7 @@ SET(elsi_URL "http://elsi-interchange.org") ...@@ -7,7 +7,7 @@ SET(elsi_URL "http://elsi-interchange.org")
SET(elsi_EMAIL "elsi-team@duke.edu") SET(elsi_EMAIL "elsi-team@duke.edu")
SET(elsi_LICENSE "BSD 3") SET(elsi_LICENSE "BSD 3")
SET(elsi_DESCRIPTION "Electronic Structure Infrastructure") SET(elsi_DESCRIPTION "Electronic Structure Infrastructure")
SET(elsi_DATESTAMP "20200629") SET(elsi_DATESTAMP "20200718")
### CMake modules ### ### CMake modules ###
LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
......
...@@ -29,8 +29,9 @@ LIST(APPEND elpa_src ...@@ -29,8 +29,9 @@ LIST(APPEND elpa_src
src/mod_precision.f90 src/mod_precision.f90
src/mod_redist_band.f90 src/mod_redist_band.f90
src/mod_scalapack_interfaces.f90 src/mod_scalapack_interfaces.f90
src/mod_single_hh_trafo_real.f90
src/resident_set_size.c src/resident_set_size.c
src/single_hh_trafo_real_dp.f90
src/single_hh_trafo_real_sp.f90
src/time.c src/time.c
src/virtual_memory.c) src/virtual_memory.c)
......
...@@ -3683,7 +3683,7 @@ contains ...@@ -3683,7 +3683,7 @@ contains
integer(kind=ik) :: i, j, lwork, liwork, info integer(kind=ik) :: i, j, lwork, liwork, info
integer(kind=BLAS_KIND) :: infoBLAS integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik), allocatable :: iwork(:) integer(kind=BLAS_KIND), allocatable :: iwork(:)
logical, intent(in) :: wantDebug logical, intent(in) :: wantDebug
logical, intent(out) :: success logical, intent(out) :: success
...@@ -3709,7 +3709,7 @@ contains ...@@ -3709,7 +3709,7 @@ contains
call check_allocate_f("solve_tridi_single: work, iwork", 603, istat, errorMessage) call check_allocate_f("solve_tridi_single: work, iwork", 603, istat, errorMessage)
call obj%timer%start("blas") call obj%timer%start("blas")
call DSTEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), & call DSTEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), int(iwork,kind=BLAS_KIND), int(liwork,kind=BLAS_KIND), & work, int(lwork,kind=BLAS_KIND), iwork, int(liwork,kind=BLAS_KIND), &
infoBLAS) infoBLAS)
info = int(infoBLAS,kind=ik) info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas") call obj%timer%stop("blas")
...@@ -4275,7 +4275,7 @@ contains ...@@ -4275,7 +4275,7 @@ contains
integer(kind=ik) :: i, j, lwork, liwork, info integer(kind=ik) :: i, j, lwork, liwork, info
integer(kind=BLAS_KIND) :: infoBLAS integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik), allocatable :: iwork(:) integer(kind=BLAS_KIND), allocatable :: iwork(:)
logical, intent(in) :: wantDebug logical, intent(in) :: wantDebug
logical, intent(out) :: success logical, intent(out) :: success
...@@ -4301,7 +4301,7 @@ contains ...@@ -4301,7 +4301,7 @@ contains
call check_allocate_f("solve_tridi_single: work, iwork", 603, istat, errorMessage) call check_allocate_f("solve_tridi_single: work, iwork", 603, istat, errorMessage)
call obj%timer%start("blas") call obj%timer%start("blas")
call DSTEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), & call DSTEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), int(iwork,kind=BLAS_KIND), int(liwork,kind=BLAS_KIND), & work, int(lwork,kind=BLAS_KIND), iwork, int(liwork,kind=BLAS_KIND), &
infoBLAS) infoBLAS)
info = int(infoBLAS,kind=ik) info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas") call obj%timer%stop("blas")
...@@ -7546,7 +7546,7 @@ contains ...@@ -7546,7 +7546,7 @@ contains
integer(kind=ik) :: i, j, lwork, liwork, info integer(kind=ik) :: i, j, lwork, liwork, info
integer(kind=BLAS_KIND) :: infoBLAS integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik), allocatable :: iwork(:) integer(kind=BLAS_KIND), allocatable :: iwork(:)
logical, intent(in) :: wantDebug logical, intent(in) :: wantDebug
logical, intent(out) :: success logical, intent(out) :: success
...@@ -7572,7 +7572,7 @@ contains ...@@ -7572,7 +7572,7 @@ contains
call check_allocate_f("solve_tridi_single: work, iwork", 603, istat, errorMessage) call check_allocate_f("solve_tridi_single: work, iwork", 603, istat, errorMessage)
call obj%timer%start("blas") call obj%timer%start("blas")
call SSTEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), & call SSTEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), int(iwork,kind=BLAS_KIND), int(liwork,kind=BLAS_KIND), & work, int(lwork,kind=BLAS_KIND), iwork, int(liwork,kind=BLAS_KIND), &
infoBLAS) infoBLAS)
info = int(infoBLAS,kind=ik) info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas") call obj%timer%stop("blas")
...@@ -8138,7 +8138,7 @@ contains ...@@ -8138,7 +8138,7 @@ contains
integer(kind=ik) :: i, j, lwork, liwork, info integer(kind=ik) :: i, j, lwork, liwork, info
integer(kind=BLAS_KIND) :: infoBLAS integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik), allocatable :: iwork(:) integer(kind=BLAS_KIND), allocatable :: iwork(:)
logical, intent(in) :: wantDebug logical, intent(in) :: wantDebug
logical, intent(out) :: success logical, intent(out) :: success
...@@ -8164,7 +8164,7 @@ contains ...@@ -8164,7 +8164,7 @@ contains
call check_allocate_f("solve_tridi_single: work, iwork", 603, istat, errorMessage) call check_allocate_f("solve_tridi_single: work, iwork", 603, istat, errorMessage)
call obj%timer%start("blas") call obj%timer%start("blas")
call SSTEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), & call SSTEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), int(iwork,kind=BLAS_KIND), int(liwork,kind=BLAS_KIND), & work, int(lwork,kind=BLAS_KIND), iwork, int(liwork,kind=BLAS_KIND), &
infoBLAS) infoBLAS)
info = int(infoBLAS,kind=ik) info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas") call obj%timer%stop("blas")
......
...@@ -70,8 +70,6 @@ contains ...@@ -70,8 +70,6 @@ contains
use elpa_abstract_impl use elpa_abstract_impl
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel use cuda_c_kernel
use cuda_functions use cuda_functions
...@@ -132,7 +130,7 @@ contains ...@@ -132,7 +130,7 @@ contains
&real& &real&
&_generic_& &_generic_&
&double& &double&
& (a(1:stripe_width,j+off+a_off-1:a_dim2,istripe),w, nbw, nl, stripe_width, nbw) & (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
enddo enddo
...@@ -140,8 +138,7 @@ contains ...@@ -140,8 +138,7 @@ contains
&real& &real&
&_cpu_& &_cpu_&
&double& &double&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,& & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik) kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt kernel_time = kernel_time + mpi_wtime()-ttt
...@@ -171,8 +168,6 @@ contains ...@@ -171,8 +168,6 @@ contains
use elpa_abstract_impl use elpa_abstract_impl
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel use cuda_c_kernel
use cuda_functions use cuda_functions
...@@ -233,7 +228,7 @@ contains ...@@ -233,7 +228,7 @@ contains
&real& &real&
&_generic_& &_generic_&
&single& &single&
& (a(1:stripe_width,j+off+a_off-1:a_dim2,istripe),w, nbw, nl, stripe_width, nbw) & (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
enddo enddo
...@@ -241,8 +236,7 @@ contains ...@@ -241,8 +236,7 @@ contains
&real& &real&
&_cpu_& &_cpu_&
&single& &single&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,& & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik) kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt kernel_time = kernel_time + mpi_wtime()-ttt
...@@ -330,7 +324,7 @@ contains ...@@ -330,7 +324,7 @@ contains
&complex& &complex&
&_generic_& &_generic_&
&double& &double&
& (a(1:stripe_width,j+off+a_off:a_dim2,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width) & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
enddo enddo
...@@ -413,7 +407,7 @@ contains ...@@ -413,7 +407,7 @@ contains
&complex& &complex&
&_generic_& &_generic_&
&single& &single&
& (a(1:stripe_width,j+off+a_off:a_dim2,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width) & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
enddo enddo
......
...@@ -70,8 +70,6 @@ contains ...@@ -70,8 +70,6 @@ contains
use elpa_abstract_impl use elpa_abstract_impl
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel use cuda_c_kernel
use cuda_functions use cuda_functions
...@@ -139,8 +137,7 @@ contains ...@@ -139,8 +137,7 @@ contains
&real& &real&
&_cpu_& &_cpu_&
&double& &double&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,& & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik) kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt kernel_time = kernel_time + mpi_wtime()-ttt
...@@ -170,8 +167,6 @@ contains ...@@ -170,8 +167,6 @@ contains
use elpa_abstract_impl use elpa_abstract_impl
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel use cuda_c_kernel
use cuda_functions use cuda_functions
...@@ -239,8 +234,7 @@ contains ...@@ -239,8 +234,7 @@ contains
&real& &real&
&_cpu_& &_cpu_&
&single& &single&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,& & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik) kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt kernel_time = kernel_time + mpi_wtime()-ttt
......
...@@ -70,8 +70,6 @@ contains ...@@ -70,8 +70,6 @@ contains
use elpa_abstract_impl use elpa_abstract_impl
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel use cuda_c_kernel
use cuda_functions use cuda_functions
...@@ -139,8 +137,7 @@ contains ...@@ -139,8 +137,7 @@ contains
&real& &real&
&_cpu_& &_cpu_&
&double& &double&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,& & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik) kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt kernel_time = kernel_time + mpi_wtime()-ttt
...@@ -170,8 +167,6 @@ contains ...@@ -170,8 +167,6 @@ contains
use elpa_abstract_impl use elpa_abstract_impl
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel use cuda_c_kernel
use cuda_functions use cuda_functions
...@@ -239,8 +234,7 @@ contains ...@@ -239,8 +234,7 @@ contains
&real& &real&
&_cpu_& &_cpu_&
&single& &single&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,& & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik) kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt kernel_time = kernel_time + mpi_wtime()-ttt
......
...@@ -70,8 +70,6 @@ contains ...@@ -70,8 +70,6 @@ contains
use elpa_abstract_impl use elpa_abstract_impl
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel use cuda_c_kernel
use cuda_functions use cuda_functions
...@@ -139,8 +137,7 @@ contains ...@@ -139,8 +137,7 @@ contains
&real& &real&
&_cpu_& &_cpu_&
&double& &double&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,& & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik) kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt kernel_time = kernel_time + mpi_wtime()-ttt
...@@ -170,8 +167,6 @@ contains ...@@ -170,8 +167,6 @@ contains
use elpa_abstract_impl use elpa_abstract_impl
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel use cuda_c_kernel
use cuda_functions use cuda_functions
...@@ -239,8 +234,7 @@ contains ...@@ -239,8 +234,7 @@ contains
&real& &real&
&_cpu_& &_cpu_&
&single& &single&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,& & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik) kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt kernel_time = kernel_time + mpi_wtime()-ttt
......
...@@ -70,8 +70,6 @@ contains ...@@ -70,8 +70,6 @@ contains
use elpa_abstract_impl use elpa_abstract_impl
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel use cuda_c_kernel
use cuda_functions use cuda_functions
...@@ -155,7 +153,7 @@ contains ...@@ -155,7 +153,7 @@ contains
&real& &real&
&_generic_& &_generic_&
&double& &double&
& (a(1:stripe_width,j+off+a_off-1:a_dim2,istripe),w, nbw, nl, stripe_width, nbw) & (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
enddo enddo
...@@ -163,8 +161,7 @@ contains ...@@ -163,8 +161,7 @@ contains
&real& &real&
&_cpu_& &_cpu_&
&double& &double&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,& & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
stripe_width)
endif endif
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik) kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
...@@ -195,8 +192,6 @@ contains ...@@ -195,8 +192,6 @@ contains
use elpa_abstract_impl use elpa_abstract_impl
use, intrinsic :: iso_c_binding use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel use cuda_c_kernel
use cuda_functions use cuda_functions
...@@ -280,7 +275,7 @@ contains ...@@ -280,7 +275,7 @@ contains
&real& &real&
&_generic_& &_generic_&
&single& &single&
& (a(1:stripe_width,j+off+a_off-1:a_dim2,istripe),w, nbw, nl, stripe_width, nbw) & (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
enddo enddo
...@@ -288,8 +283,7 @@ contains ...@@ -288,8 +283,7 @@ contains
&real& &real&
&_cpu_& &_cpu_&
&single& &single&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,& & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
stripe_width)
endif endif
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik) kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
...@@ -401,7 +395,7 @@ contains ...@@ -401,7 +395,7 @@ contains
&complex& &complex&
&_generic_& &_generic_&
&double& &double&
& (a(1:stripe_width,j+off+a_off:a_dim2,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width) & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
enddo enddo
endif endif
...@@ -514,7 +508,7 @@ contains ...@@ -514,7 +508,7 @@ contains
&complex& &complex&
&_generic_& &_generic_&
&single& &single&
& (a(1:stripe_width,j+off+a_off:a_dim2,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width) & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
enddo enddo
endif endif
......
module single_hh_trafo_real
implicit none
public single_hh_trafo_real_cpu_double
public single_hh_trafo_real_cpu_single
contains
subroutine single_hh_trafo_real_cpu_double(q, hh, nb, nq, ldq)
use elpa_abstract_impl
use precision
! Perform single real Householder transformation.
! This routine is not performance critical and thus it is coded here in Fortran
implicit none
! class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: nb, nq, ldq
! real(kind=rk8), intent(inout) :: q(ldq, *)
! real(kind=rk8), intent(in) :: hh(*)
real(kind=rk8), intent(inout) :: q(1:ldq, 1:nb)
real(kind=rk8), intent(in) :: hh(1:nb)
integer(kind=ik) :: i
real(kind=rk8) :: v(nq)
!#ifdef WITH_OPENMP
! call obj%timer%start("single_hh_trafo_real_cpu_openmp_double")
!#else
! call obj%timer%start("single_hh_trafo_real_cpu_double")
!#endif
! v = q * hh
v(:) = q(1:nq,1)
do i