Chapter 1: Creating and distributing a matrix-vector multiplication kernel

**Petr Schreiber** · 17-09-2012, 22:24

This is code adapted from Listing 1.1 in Chapter 1 of the book *OpenCL in Action*.
The example performs the following matrix-vector multiplication:

|  0.0   2.0   4.0   6.0 |   | 0.0 |   | 84.0  |
|                        |   |     |   |       |
|  8.0  10.0  12.0  14.0 |   | 3.0 |   | 228.0 | 
|                        | x |     | = |       |
| 16.0  18.0  20.0  22.0 |   | 6.0 |   | 372.0 |
|                        |   |     |   |       |
| 24.0  26.0  28.0  30.0 |   | 9.0 |   | 516.0 |

Compared to the original example, I added more informative console output.

' -- NOTE: Enhanced to list the result/expected result

Uses "Console", "File"

#INCLUDE "%APP_INCLUDEPATH%/cl/cl.tBasicU"

' Constants used by TBMain:
'   $PROGRAM_FILE - path of the OpenCL kernel source, built by appending
'                   "matvec.cl" to APP_SourcePath; loaded with FILE_Load
'   $KERNEL_FUNC  - name of the kernel function handed to clCreateKernel
Begin Const
  $PROGRAM_FILE = APP_SourcePath+"matvec.cl"
  $KERNEL_FUNC  = "matvec_mult"
End Const

' Program entry point.
'
' Multiplies a 4x4 matrix by a 4-element vector on an OpenCL device using the
' kernel named by $KERNEL_FUNC from the file $PROGRAM_FILE, then compares the
' device result with a reference computed on the host and prints both.
'
' Every OpenCL call that reports a status is checked; on failure a message is
' printed via perror, the process return code is set to 1, the program waits
' for a key press and the function exits. On success the return code is 0.
Function TBMain()

   /* Host/device data structures */
   tcl_platform_id   platform
   tcl_device_id     device
   tcl_context       context
   tcl_command_queue queue
   tcl_int           i, ErrCl

   /* Program/kernel data structures */
   tcl_program program

   String program_buffer, program_log
   tSize program_size, log_size
   tcl_kernel kernel

   /* Data and buffers */
   Single mtx(16), vec(4), result(4)
   Single correct(4) = 0.0, 0.0, 0.0, 0.0
   tcl_mem mat_buff, vec_buff, res_buff
   tSize work_units_per_kernel

   /* Initialize data to be processed by the kernel:
      mtx holds 0, 2, 4, ..., 30 (four consecutive values per matrix row) */
   For i = 1 To 16
      mtx(i) = (i-1) * 2.0
   Next

   /* vec holds 0, 3, 6, 9; correct(r) accumulates the dot product of
      matrix row r with vec as the host-side reference result */
   For i = 1 To 4
      vec(i) = (i-1) * 3.0
      correct(1) += mtx(i)    * vec(i)
      correct(2) += mtx(i+4)  * vec(i)
      correct(3) += mtx(i+8)  * vec(i)
      correct(4) += mtx(i+12) * vec(i)
   Next

   /* Identify a platform */
   ErrCl = clGetPlatformIDs(1, platform, ByVal NULL)
   If (ErrCl < 0) Then
      perror("Couldn't find any platforms")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   /* Access a device: prefer a GPU, fall back to a CPU device when the
      platform exposes no GPU */
   ErrCl = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, device, ByVal NULL)
   If (ErrCl < 0) Then
      ErrCl = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, device, ByVal NULL)
   End If
   If (ErrCl < 0) Then
      perror("Couldn't find any devices")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   /* Create the context */
   context = clCreateContext(ByVal NULL, 1, device, ByVal NULL, ByVal NULL, ErrCl)
   If (ErrCl < 0) Then
      perror("Couldn't create a context")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   /* Read program file and place content into buffer;
      an empty buffer is treated as "file not found" */
   program_buffer = FILE_Load($PROGRAM_FILE)
   program_size   = Len(program_buffer)
   If (program_size = 0) Then
      perror("Couldn't find the program file")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   /* Create program from file */
   program = clCreateProgramWithSource(context, 1, VarPtr(program_buffer), program_size, ErrCl)
   If (ErrCl < 0) Then
      perror("Couldn't create the program")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If
   program_buffer = ""

   /* Build program */
   ErrCl = clBuildProgram(program, 0, ByVal 0, "", 0, ByVal 0)
   If (ErrCl < 0) Then

      /* Find size of log and print to std output:
         first query the log length, then fetch the log into a pre-sized string */
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
            0, ByVal NULL, log_size)
      program_log = Repeat$(log_size+1, $NUL)

      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
            log_size + 1, ByVal StrPtr(program_log), ByVal NULL)
      PrintL program_log
      program_log = ""

      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   /* Create kernel for the matvec_mult function */
   kernel = clCreateKernel(program, StrPtr($KERNEL_FUNC), ErrCl)
   If (ErrCl < 0) Then
      perror("Couldn't create the kernel")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   /* Create CL buffers to hold input and output data;
      each creation is checked so a failed allocation is reported here
      instead of surfacing later as a confusing kernel/read error */
   mat_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | 
      CL_MEM_COPY_HOST_PTR, SizeOf(Single)*16, mtx, ErrCl)
   If (ErrCl < 0) Then
      perror("Couldn't create a buffer object")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   vec_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | 
      CL_MEM_COPY_HOST_PTR, SizeOf(Single)*4, vec, ErrCl)
   If (ErrCl < 0) Then
      perror("Couldn't create a buffer object")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   res_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
      SizeOf(Single)*4, ByVal NULL, ErrCl)
   If (ErrCl < 0) Then
      perror("Couldn't create a buffer object")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   /* Create kernel arguments from the CL buffers:
      argument 0 = matrix, 1 = vector, 2 = result */
   ErrCl = clSetKernelArg(kernel, 0, SizeOf(tcl_mem), mat_buff)
   If (ErrCl < 0) Then
      perror("Couldn't set the kernel argument")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   ErrCl = clSetKernelArg(kernel, 1, SizeOf(tcl_mem), vec_buff)
   If (ErrCl < 0) Then
      perror("Couldn't set the kernel argument")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   ErrCl = clSetKernelArg(kernel, 2, SizeOf(tcl_mem), res_buff)
   If (ErrCl < 0) Then
      perror("Couldn't set the kernel argument")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   /* Create a CL command queue for the device */
   queue = clCreateCommandQueue(context, device, 0, ErrCl)
   If (ErrCl < 0) Then
      perror("Couldn't create the command queue")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   /* Enqueue the kernel for execution: one work dimension, 4 work-items */
   work_units_per_kernel = 4 /* 4 work-units per kernel */ 
   ErrCl = clEnqueueNDRangeKernel(queue, kernel, 1, ByVal NULL, work_units_per_kernel, 
      ByVal NULL, 0, ByVal NULL, ByVal NULL)
   If (ErrCl < 0) Then
      perror("Couldn't enqueue the kernel execution command")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   /* Read the result; CL_TRUE requests a blocking read */
   ErrCl = clEnqueueReadBuffer(queue, res_buff, CL_TRUE, 0, SizeOf(Single)*4, 
      result, 0, ByVal NULL, ByVal NULL)

   If (ErrCl < 0) Then
      perror("Couldn't enqueue the read buffer command")
      APP_SetReturnCode(1) : WaitKey : Exit Function
   End If

   /* Test the result: exact comparison of all four elements against the
      host-computed reference */
   If((result(1) = correct(1)) And (result(2) = correct(2)) And (result(3) = correct(3)) And (result(4) = correct(4))) Then

      PrintL("Matrix-vector multiplication successful.")

   Else

      PrintL("Matrix-vector multiplication unsuccessful.")

   End If
   PrintL("Calculated result: " + Join$(result, ", "))
   PrintL("Expected result  : " + Join$(correct, ", "))

   /* Deallocate resources */
   clReleaseMemObject(mat_buff)
   clReleaseMemObject(vec_buff)
   clReleaseMemObject(res_buff)
   clReleaseKernel(kernel)
   clReleaseCommandQueue(queue)
   clReleaseProgram(program)
   clReleaseContext(context)

   PrintL "Press any key to continue..."
   WaitKey
   APP_SetReturnCode(0)

End Function

' Prints sError on its own line in red with the intensity flag set, then
' switches the console text attribute to red+green+blue (without the
' intensity flag) for whatever is printed afterwards.
Function pError( sError As String )

  Long errorAttr  = %CONSOLE_FOREGROUND_RED | %CONSOLE_FOREGROUND_INTENSITY
  Long normalAttr = %CONSOLE_FOREGROUND_RED | %CONSOLE_FOREGROUND_GREEN | %CONSOLE_FOREGROUND_BLUE

  Console_SetTextAttribute(errorAttr)
  PrintL(sError)
  Console_SetTextAttribute(normalAttr)

End Function

You will need the latest thinBasic and the OpenCL headers to run it, plus, of course, a modern GPU or CPU with OpenCL support.

Enjoy!
Petr