Using scripts to generate specialized functions

In this example we write a Python script power.py that will generate specialized functions to calculate the n-th power of x.

#!/usr/bin/python

POWER_MAX = 20

def compute_x_prod(n,d):
  if n == 0:
    d[0] = None
    return d
  if n == 1:
    d[1] = None
    return d
  if n in d:
    return d
  m = n/2
  d = compute_x_prod(m,d)
  d[n] = None
  d[2*m] = None
  return d

def print_subroutine(n):
  keys = compute_x_prod(n,{}).keys()
  keys.sort()
  output = []
  print "double precision function power_%d(x1)"%n
  print " double precision, intent(in) :: x1"
  print " BEGIN_DOC"
  print "!  Fast computation of x**%d"%(n)
  print " END_DOC"
  for i in range(1,len(keys)):
     output.append( "x%d"%keys[i] )
  if output != []:
    print " double precision :: "+', '.join(output)
  for i in range(1,len(keys)):
   ki = keys[i]
   ki1 = keys[i-1]
   if ki == 2*ki1:
     print " x%d"%ki + " = x%d * x%d"%(ki1,ki1)
   else:
     print " x%d"%ki + " = x%d * x1"%(ki1)
  print " power_%d = x%d"%(n,n)
  print "end"

for i in range(POWER_MAX):
  print_subroutine (i+1)
  print ''

Executing this script gives

double precision function power_1(x1)
 double precision, intent(in) :: x1
 BEGIN_DOC
!  Fast computation of x**1
 END_DOC
 power_1 = x1
end

double precision function power_2(x1)
 double precision, intent(in) :: x1
 BEGIN_DOC
!  Fast computation of x**2
 END_DOC
 double precision :: x2
 x2 = x1 * x1
 power_2 = x2
end

double precision function power_3(x1)
 double precision, intent(in) :: x1
 BEGIN_DOC
!  Fast computation of x**3
 END_DOC
 double precision :: x2, x3
 x2 = x1 * x1
 x3 = x2 * x1
 power_3 = x3
end

...


double precision function power_17(x1)
 double precision, intent(in) :: x1
 BEGIN_DOC
!  Fast computation of x**17
 END_DOC
 double precision :: x2, x4, x8, x16, x17
 x2 = x1 * x1
 x4 = x2 * x2
 x8 = x4 * x4
 x16 = x8 * x8
 x17 = x16 * x1
 power_17 = x17
end

double precision function power_18(x1)
 double precision, intent(in) :: x1
 BEGIN_DOC
!  Fast computation of x**18
 END_DOC
 double precision :: x2, x4, x8, x9, x18
 x2 = x1 * x1
 x4 = x2 * x2
 x8 = x4 * x4
 x9 = x8 * x1
 x18 = x9 * x9
 power_18 = x18
end

double precision function power_19(x1)
 double precision, intent(in) :: x1
 BEGIN_DOC
!  Fast computation of x**19
 END_DOC
 double precision :: x2, x4, x8, x9, x18, x19
 x2 = x1 * x1
 x4 = x2 * x2
 x8 = x4 * x4
 x9 = x8 * x1
 x18 = x9 * x9
 x19 = x18 * x1
 power_19 = x19
end

double precision function power_20(x1)
 double precision, intent(in) :: x1
 BEGIN_DOC
!  Fast computation of x**20
 END_DOC
 double precision :: x2, x4, x5, x10, x20
 x2 = x1 * x1
 x4 = x2 * x2
 x5 = x4 * x1
 x10 = x5 * x5
 x20 = x10 * x10
 power_20 = x20
end

Then, we create a benchmark.irp.f file that contains provider to compute the 20 first n-th power of x using the traditional x**n, and another provider that will use our specialized functions

BEGIN_PROVIDER [ double precision, x ]
  implicit none
  BEGIN_DOC
! Value of x
  END_DOC
  x = 1.2345d0
END_PROVIDER

BEGIN_PROVIDER [ double precision, x_p, (20) ]
  implicit none
  BEGIN_DOC
! array of x**p for 0 < p < 21 with the standard power functions
  END_DOC
  integer :: i
  do i=1,20
    x_p(i) = x**i
  enddo

END_PROVIDER

! Put the power.py script here for better inlining of the functions
BEGIN_SHELL [ /usr/bin/python ]
import power
END_SHELL

BEGIN_PROVIDER [ double precision, x_p_fast, (20) ]
  implicit none
  BEGIN_DOC
! array of x**p for 0 < p < 21 with the fast power functions
  END_DOC
  BEGIN_SHELL [ /bin/bash ]
  for i in {1..20}
  do
    echo "  double precision, external :: power_$i"
    echo "  !DIR$ FORCEINLINE"
    echo "  x_p_fast($i) = power_$i(x)"
  done
  END_SHELL

END_PROVIDER

We now Create a codelet for the x_p and the x_p_fast providers using

$ irpf90 -c      x_p:100000000
$ irpf90 -c x_p_fast:100000000
$ make

We easily see that we get a speedup of 12x with the specialized power routines:

$ ./codelet_x_p
 x_p
 -----------
 Cycles:
   261.97605379999999     
 Seconds:
  1.13999589999999997E-007

$ ./codelet_x_p_fast 
 x_p_fast
 -----------
 Cycles:
   21.707082620000001     
 Seconds:
  9.47659000000000108E-009