Commit 4ca77db1 authored by Victor Yu's avatar Victor Yu

Stop ifort from creating temp array in ELPA2 kernel

parent 9ed1e6ab
# ELSI changelog
## v2.6.1 (June 2020)
## Not released
### PEXSI
* Removed an improper abort from the error handling code of PEXSI.
### ELPA
* Fix a performance regression of the ELPA2 generic kernel.
### Known issues
* The ELPA code can not be compiled with the NAG Fortran compiler, due to the
......@@ -13,6 +13,11 @@
* Depending on the choice of k-points, the complex PEXSI solver may randomly
fail at the inertia counting stage.
## v2.6.1 (June 2020)
### PEXSI
* Removed an improper abort from the error handling code of PEXSI.
## v2.6.0 (June 2020)
### ELSI interface
......
......@@ -7,7 +7,7 @@ SET(elsi_URL "http://elsi-interchange.org")
SET(elsi_EMAIL "elsi-team@duke.edu")
SET(elsi_LICENSE "BSD 3")
SET(elsi_DESCRIPTION "Electronic Structure Infrastructure")
SET(elsi_DATESTAMP "20200629")
SET(elsi_DATESTAMP "20200718")
### CMake modules ###
LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
......
......@@ -29,8 +29,9 @@ LIST(APPEND elpa_src
src/mod_precision.f90
src/mod_redist_band.f90
src/mod_scalapack_interfaces.f90
src/mod_single_hh_trafo_real.f90
src/resident_set_size.c
src/single_hh_trafo_real_dp.f90
src/single_hh_trafo_real_sp.f90
src/time.c
src/virtual_memory.c)
......
......@@ -3683,7 +3683,7 @@ contains
integer(kind=ik) :: i, j, lwork, liwork, info
integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik), allocatable :: iwork(:)
integer(kind=BLAS_KIND), allocatable :: iwork(:)
logical, intent(in) :: wantDebug
logical, intent(out) :: success
......@@ -3709,7 +3709,7 @@ contains
call check_allocate_f("solve_tridi_single: work, iwork", 603, istat, errorMessage)
call obj%timer%start("blas")
call DSTEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), int(iwork,kind=BLAS_KIND), int(liwork,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), iwork, int(liwork,kind=BLAS_KIND), &
infoBLAS)
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
......@@ -4275,7 +4275,7 @@ contains
integer(kind=ik) :: i, j, lwork, liwork, info
integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik), allocatable :: iwork(:)
integer(kind=BLAS_KIND), allocatable :: iwork(:)
logical, intent(in) :: wantDebug
logical, intent(out) :: success
......@@ -4301,7 +4301,7 @@ contains
call check_allocate_f("solve_tridi_single: work, iwork", 603, istat, errorMessage)
call obj%timer%start("blas")
call DSTEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), int(iwork,kind=BLAS_KIND), int(liwork,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), iwork, int(liwork,kind=BLAS_KIND), &
infoBLAS)
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
......@@ -7546,7 +7546,7 @@ contains
integer(kind=ik) :: i, j, lwork, liwork, info
integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik), allocatable :: iwork(:)
integer(kind=BLAS_KIND), allocatable :: iwork(:)
logical, intent(in) :: wantDebug
logical, intent(out) :: success
......@@ -7572,7 +7572,7 @@ contains
call check_allocate_f("solve_tridi_single: work, iwork", 603, istat, errorMessage)
call obj%timer%start("blas")
call SSTEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), int(iwork,kind=BLAS_KIND), int(liwork,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), iwork, int(liwork,kind=BLAS_KIND), &
infoBLAS)
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
......@@ -8138,7 +8138,7 @@ contains
integer(kind=ik) :: i, j, lwork, liwork, info
integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik), allocatable :: iwork(:)
integer(kind=BLAS_KIND), allocatable :: iwork(:)
logical, intent(in) :: wantDebug
logical, intent(out) :: success
......@@ -8164,7 +8164,7 @@ contains
call check_allocate_f("solve_tridi_single: work, iwork", 603, istat, errorMessage)
call obj%timer%start("blas")
call SSTEDC('I', int(nlen,kind=BLAS_KIND), d, e, q, int(ldq,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), int(iwork,kind=BLAS_KIND), int(liwork,kind=BLAS_KIND), &
work, int(lwork,kind=BLAS_KIND), iwork, int(liwork,kind=BLAS_KIND), &
infoBLAS)
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
......
......@@ -70,8 +70,6 @@ contains
use elpa_abstract_impl
use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel
use cuda_functions
......@@ -132,7 +130,7 @@ contains
&real&
&_generic_&
&double&
& (a(1:stripe_width,j+off+a_off-1:a_dim2,istripe),w, nbw, nl, stripe_width, nbw)
& (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
enddo
......@@ -140,8 +138,7 @@ contains
&real&
&_cpu_&
&double&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,&
stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt
......@@ -171,8 +168,6 @@ contains
use elpa_abstract_impl
use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel
use cuda_functions
......@@ -233,7 +228,7 @@ contains
&real&
&_generic_&
&single&
& (a(1:stripe_width,j+off+a_off-1:a_dim2,istripe),w, nbw, nl, stripe_width, nbw)
& (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
enddo
......@@ -241,8 +236,7 @@ contains
&real&
&_cpu_&
&single&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,&
stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt
......@@ -330,7 +324,7 @@ contains
&complex&
&_generic_&
&double&
& (a(1:stripe_width,j+off+a_off:a_dim2,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
enddo
......@@ -413,7 +407,7 @@ contains
&complex&
&_generic_&
&single&
& (a(1:stripe_width,j+off+a_off:a_dim2,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
enddo
......
......@@ -70,8 +70,6 @@ contains
use elpa_abstract_impl
use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel
use cuda_functions
......@@ -139,8 +137,7 @@ contains
&real&
&_cpu_&
&double&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,&
stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt
......@@ -170,8 +167,6 @@ contains
use elpa_abstract_impl
use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel
use cuda_functions
......@@ -239,8 +234,7 @@ contains
&real&
&_cpu_&
&single&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,&
stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt
......
......@@ -70,8 +70,6 @@ contains
use elpa_abstract_impl
use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel
use cuda_functions
......@@ -139,8 +137,7 @@ contains
&real&
&_cpu_&
&double&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,&
stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt
......@@ -170,8 +167,6 @@ contains
use elpa_abstract_impl
use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel
use cuda_functions
......@@ -239,8 +234,7 @@ contains
&real&
&_cpu_&
&single&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,&
stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt
......
......@@ -70,8 +70,6 @@ contains
use elpa_abstract_impl
use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel
use cuda_functions
......@@ -139,8 +137,7 @@ contains
&real&
&_cpu_&
&double&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,&
stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt
......@@ -170,8 +167,6 @@ contains
use elpa_abstract_impl
use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel
use cuda_functions
......@@ -239,8 +234,7 @@ contains
&real&
&_cpu_&
&single&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,&
stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
kernel_time = kernel_time + mpi_wtime()-ttt
......
......@@ -70,8 +70,6 @@ contains
use elpa_abstract_impl
use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel
use cuda_functions
......@@ -155,7 +153,7 @@ contains
&real&
&_generic_&
&double&
& (a(1:stripe_width,j+off+a_off-1:a_dim2,istripe),w, nbw, nl, stripe_width, nbw)
& (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
enddo
......@@ -163,8 +161,7 @@ contains
&real&
&_cpu_&
&double&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,&
stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
endif
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
......@@ -195,8 +192,6 @@ contains
use elpa_abstract_impl
use, intrinsic :: iso_c_binding
use single_hh_trafo_real
use cuda_c_kernel
use cuda_functions
......@@ -280,7 +275,7 @@ contains
&real&
&_generic_&
&single&
& (a(1:stripe_width,j+off+a_off-1:a_dim2,istripe),w, nbw, nl, stripe_width, nbw)
& (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
enddo
......@@ -288,8 +283,7 @@ contains
&real&
&_cpu_&
&single&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl,&
stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
endif
kernel_flops = kernel_flops + 4*int(nl,lik)*int(ncols,lik)*int(nbw,lik)
......@@ -401,7 +395,7 @@ contains
&complex&
&_generic_&
&double&
& (a(1:stripe_width,j+off+a_off:a_dim2,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
enddo
endif
......@@ -514,7 +508,7 @@ contains
&complex&
&_generic_&
&single&
& (a(1:stripe_width,j+off+a_off:a_dim2,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
& (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
enddo
endif
......
module single_hh_trafo_real
implicit none
public single_hh_trafo_real_cpu_double
public single_hh_trafo_real_cpu_single
contains
subroutine single_hh_trafo_real_cpu_double(q, hh, nb, nq, ldq)
use elpa_abstract_impl
use precision
! Perform single real Householder transformation.
! This routine is not performance critical and thus it is coded here in Fortran
implicit none
! class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: nb, nq, ldq
! real(kind=rk8), intent(inout) :: q(ldq, *)
! real(kind=rk8), intent(in) :: hh(*)
real(kind=rk8), intent(inout) :: q(1:ldq, 1:nb)
real(kind=rk8), intent(in) :: hh(1:nb)
integer(kind=ik) :: i
real(kind=rk8) :: v(nq)
!#ifdef WITH_OPENMP
! call obj%timer%start("single_hh_trafo_real_cpu_openmp_double")
!#else
! call obj%timer%start("single_hh_trafo_real_cpu_double")
!#endif
! v = q * hh
v(:) = q(1:nq,1)
do i=2,nb
v(:) = v(:) + q(1:nq,i) * hh(i)
enddo
! v = v * tau
v(:) = v(:) * hh(1)
! q = q - v * hh**T
q(1:nq,1) = q(1:nq,1) - v(:)
do i=2,nb
q(1:nq,i) = q(1:nq,i) - v(:) * hh(i)
enddo
!#ifdef WITH_OPENMP
! call obj%timer%stop("single_hh_trafo_real_cpu_openmp_double")
!#else
! call obj%timer%stop("single_hh_trafo_real_cpu_double")
!#endif
end subroutine
! single precision implementation at the moment duplicated !!!
subroutine single_hh_trafo_real_cpu_single(q, hh, nb, nq, ldq)
use elpa_abstract_impl
use precision
! Perform single real Householder transformation.
! This routine is not performance critical and thus it is coded here in Fortran
implicit none
!class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: nb, nq, ldq
! real(kind=rk4), intent(inout) :: q(ldq, *)
! real(kind=rk4), intent(in) :: hh(*)
real(kind=rk4), intent(inout) :: q(1:ldq, 1:nb)
real(kind=rk4), intent(in) :: hh(1:nb)
integer(kind=ik) :: i
real(kind=rk4) :: v(nq)
!#ifdef WITH_OPENMP
! call obj%timer%start("single_hh_trafo_real_cpu_openmp_single")
!#else
! call obj%timer%start("single_hh_trafo_real_cpu_single")
!#endif
! v = q * hh
v(:) = q(1:nq,1)
do i=2,nb
v(:) = v(:) + q(1:nq,i) * hh(i)
enddo
! v = v * tau
v(:) = v(:) * hh(1)
! q = q - v * hh**T
q(1:nq,1) = q(1:nq,1) - v(:)
do i=2,nb
q(1:nq,i) = q(1:nq,i) - v(:) * hh(i)
enddo
!#ifdef WITH_OPENMP
! call obj%timer%stop("single_hh_trafo_real_cpu_openmp_single")
!#else
! call obj%timer%stop("single_hh_trafo_real_cpu_single")
!#endif
end subroutine
end module
subroutine single_hh_trafo_real_cpu_double(q, hh, nb, nq, ldq)
use elpa_abstract_impl
use precision
! Perform single real Householder transformation.
! This routine is not performance critical and thus it is coded here in Fortran
implicit none
integer(kind=ik), intent(in) :: nb, nq, ldq
real(kind=rk8), intent(inout) :: q(ldq, *)
real(kind=rk8), intent(in) :: hh(*)
integer(kind=ik) :: i
real(kind=rk8) :: v(nq)
! v = q * hh
v(:) = q(1:nq,1)
do i=2,nb
v(:) = v(:) + q(1:nq,i) * hh(i)
enddo
! v = v * tau
v(:) = v(:) * hh(1)
! q = q - v * hh**T
q(1:nq,1) = q(1:nq,1) - v(:)
do i=2,nb
q(1:nq,i) = q(1:nq,i) - v(:) * hh(i)
enddo
end subroutine
subroutine single_hh_trafo_real_cpu_single(q, hh, nb, nq, ldq)
use elpa_abstract_impl
use precision
! Perform single real Householder transformation.
! This routine is not performance critical and thus it is coded here in Fortran
implicit none
integer(kind=ik), intent(in) :: nb, nq, ldq
real(kind=rk4), intent(inout) :: q(ldq, *)
real(kind=rk4), intent(in) :: hh(*)
integer(kind=ik) :: i
real(kind=rk4) :: v(nq)
! v = q * hh
v(:) = q(1:nq,1)
do i=2,nb
v(:) = v(:) + q(1:nq,i) * hh(i)
enddo
! v = v * tau
v(:) = v(:) * hh(1)
! q = q - v * hh**T
q(1:nq,1) = q(1:nq,1) - v(:)
do i=2,nb
q(1:nq,i) = q(1:nq,i) - v(:) * hh(i)
enddo
end subroutine
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment