      program multiplication

      integer, parameter :: size= 2000
      external:: dgemm        ! subroutines from BLAS
      real, dimension(:,:), allocatable :: A,B,C,D
      real :: t1, t2, t3, tt, mflops, trace, diff
      integer :: n, loops, i,j,l, case
      
      allocate( A(size, size), B(size, size), C(size,size), D(size,size) )

      n = size
      loops = 1

      do i = 1, n
         do j = 1, n
         !do j = max(i-1, 1), min(i+1, n)
            A(i,j) = i+j
            B(i,j) = i-j
         end do
      end do

      !write(*,*)'--------------------------------------'
      !   do i=1, n
      !      write(*,'(i6, 300es12.4)') i, A(i,:)
      !   enddo
      !   write(*,*)'--------------------------------------'
      !   do i=1, n
      !      write(*,'(i6, 300es12.4)') i, B(i,:)
      !   enddo
      !   print*
         
      ! case = 0   sparse matrix
      ! case = 1   "standard algorithm"
      ! case = 2   using cache memory
      ! case = 3   Fortran90 
       case = 4    ! LAPACK
      do case = 2, 4
      
         call cpu_time( t1 )

         do l = 1, loops
            if(case == 0) then 
               D(:,:) = 0.
               do i = 1, n
                  do j = max(i-2, 1), min(i+2, n)
                     do k = max(min(i-1, j-1),1), min( max(i+1, j+1), n)
                        D(i,j) = D(i,j) + A(i,k)*B(k,j)
                     end do
                  end do
               end do
               
            elseif(case == 1) then 
               do i = 1, n
                  do j = 1, n
                     C(i,j) = 0
                     do k = 1, n
                        C(i,j) = C(i,j) + A(i,k)*B(k,j)
                     end do
                  end do
               end do

            elseif( case == 2) then 
               call BlockMulti(n, A, B, C)

            elseif( case == 3) then 
               C(1:n, 1:n) = matmul(A(1:n, 1:n), B(1:n, 1:n) )

            elseif( case == 4) then 
               D = 0.
               call DGEMM('N', 'N', n, n, n, 1., A(1:n, 1:n) , n, B(1:n, 1:n) , n, 1., D(1:n,1:n), n)
            endif

         end do  ! do l=1,loops
         
         
         call cpu_time( t2 )


         tt = (t2 - t1 ) / loops
         
         if(tt > 0) then
            mflops = 2*real(n)**3/tt/1.e6
            
            write(*,'(a10,i5, a10, i8, 2(a10, es14.6))' )  &
              "CASE:", case, " size = ", n, "time = ", tt, "  mflops = ", mflops

            ! verification of the vanishing traces
            trace = 0
            do i = 1, size
               trace = trace + C(i,i)
            end do
            if(trace /= 0.) write(*,*) "trace(A*B)=",trace
            
         else
            print*,'TOO short time'
         endif

         
      enddo  ! do case = 1, 3

      !write(*,*)'--------------------------------------'
      !do i=1, n
      !   write(*,'(i6, 300es12.4)') i, D(i,:)
      !enddo
      !write(*,*)'--------------------------------------'
      !do i=1, n
      !   write(*,'(i6, 300es12.4)') i, C(i,:)
      !enddo
      !write(*,*)'--------------------------------------'

      diff = 0.
      do i=1, n
         !write(*,'(i6, 300es12.4)') i, C(i,:)-D(i,:)
         diff = diff + dot_product(C(i,:)- D(i,:), C(i,:)- D(i,:))
      enddo

      print*,'Difference = ', diff


      
      ! do i = 1, n
      !    write(*,"(10F7.0)") (A(i,j),j=1,n)
      ! end do

      ! do i = 1, n
      !    write(*,"(10F7.0)") (B(i,j),j=1,n)
      ! end do

      ! do i = 1, n
      !    write(*,"(10F7.0)") (C(i,j),j=1,n)
      ! end do


      deallocate(A, B, C)

    end program Multiplication




    
    subroutine BlockMulti(n, A, B, C )
      integer, intent(in) ::  n
      !real :: A(lda,*), B(ldb,*), C(ldc,*)
      real, dimension(1:n, 1:n), intent(inout) :: A, B, C 
 
      integer,parameter ::  BSIZE = 40
      real, dimension(:,:), allocatable :: AT_B, B_B, C_B
      integer :: i, j, isize, jsize, ib, jb, l, lb
      
      allocate( AT_B(1:BSIZE, 1:BSIZE), B_B(1:BSIZE, 1:BSIZE), C_B(1:BSIZE, 1:BSIZE) )

      do i = 1, n, BSIZE
         isize = min(BSIZE, n-i+1)

         do j = 1, n, BSIZE
            jsize = min(BSIZE, n-j+1)

            do ib = 1, isize
               do jb = 1, jsize
                  C_B(ib,jb) = 0
               end do
            end do

            do l = 1, n, BSIZE
               lsize = min(BSIZE, n-l+1)

               do ib = 1, isize
                  do lb = 1, lsize
                     AT_B(lb,ib) = A(i+ib-1,l+lb-1)
                  end do
               end do

               do lb = 1, lsize
                  do jb = 1, jsize
                     B_B(lb,jb) = B(l+lb-1,j+jb-1)
                  end do
               end do

               !print*,'sizes C:', i , j, isize, jsize, lsize

               call BlockMulti_Loc(isize,lsize,jsize, AT_B, BSIZE, B_B, BSIZE, C_B, BSIZE)
            end do

            !print*,'sizes X:', i , j, isize, jsize

            do ib = 1, isize
               do jb = 1, jsize
                  C(i+ib-1,j+jb-1) = C_B(ib,jb)
               end do
            end do

         end do
      end do

      deallocate( AT_B, B_B,C_B )
    end subroutine BlockMulti


      subroutine BlockMulti_Loc(m, n, k, A, lda, B, ldb, C, ldc)
      integer m, n, k, lda, ldb, ldc
      real ::  A(lda,lda), B(ldb,ldb), C(ldc,ldb)
      
      do i = 1, m
         do j = 1, k
            do l = 1, n
               C(i,j) = C(i,j) + A(l,i)*B(l,j)
            end do
         end do
      end do

    end subroutine BlockMulti_Loc

 
