PDA

View Full Version : Chapter 1: Creating and distributing a matrix-vector multiplication kernel



Petr Schreiber
17-09-2012, 22:24
This is the adapted code from listing 1.1 in chapter 1 of the book OpenCL in Action.
The example basically performs this matrix multiplication:


$$
\begin{pmatrix}
 0.0 &  2.0 &  4.0 &  6.0 \\
 8.0 & 10.0 & 12.0 & 14.0 \\
16.0 & 18.0 & 20.0 & 22.0 \\
24.0 & 26.0 & 28.0 & 30.0
\end{pmatrix}
\times
\begin{pmatrix} 0.0 \\ 3.0 \\ 6.0 \\ 9.0 \end{pmatrix}
=
\begin{pmatrix} 84.0 \\ 228.0 \\ 372.0 \\ 516.0 \end{pmatrix}
$$


Compared to the original example, I added more informative text to the console output.



' -- NOTE: Enhanced to list the result/expected result

Uses "Console", "File"

#INCLUDE "%APP_INCLUDEPATH%/cl/cl.tBasicU"

/*
 * Script-wide string constants used by TBMain:
 *   $PROGRAM_FILE - path of the OpenCL C source file that is loaded and built
 *                   at run time ("matvec.cl" appended to APP_SourcePath;
 *                   presumably the directory of this script - TODO confirm).
 *   $KERNEL_FUNC  - name of the kernel function that is looked up in the
 *                   built program.
 */
Begin Const
$PROGRAM_FILE = APP_SourcePath+"matvec.cl"
$KERNEL_FUNC = "matvec_mult"
End Const

Function TBMain()

  /*
   * Script entry point.
   *
   * Multiplies a 4x4 matrix by a 4-element vector on an OpenCL device and
   * compares the device result with a reference computed on the host.
   *
   * Steps: pick the first platform, pick a device (GPU preferred, CPU as a
   * fallback), create a context, load and build $PROGRAM_FILE, create the
   * $KERNEL_FUNC kernel, upload the inputs, run 4 work-items, read back the
   * result and print both the calculated and the expected vector.
   *
   * On any OpenCL failure an error message is printed, the application
   * return code is set to 1 and the function exits after a key press.
   * On success the return code is set to 0.
   */

  /* Host/device data structures */
  tcl_platform_id   platform
  tcl_device_id     device
  tcl_context       context
  tcl_command_queue queue
  tcl_int           i, ErrCl

  /* Program/kernel data structures */
  tcl_program program

  String     program_buffer, program_log
  tSize      program_size, log_size
  tcl_kernel kernel

  /* Data and buffers */
  Single  mtx(16), vec(4), result(4)
  Single  correct(4) = 0.0, 0.0, 0.0, 0.0
  tcl_mem mat_buff, vec_buff, res_buff
  tSize   work_units_per_kernel

  /* Initialize data to be processed by the kernel:
     mtx holds the 4x4 matrix row by row (0, 2, 4, ... 30) */
  For i = 1 To 16
    mtx(i) = (i-1) * 2.0
  Next

  /* vec is (0, 3, 6, 9); correct(r) accumulates the dot product of
     matrix row r with vec as the host-side reference result */
  For i = 1 To 4
    vec(i) = (i-1) * 3.0
    correct(1) += mtx(i)    * vec(i)
    correct(2) += mtx(i+4)  * vec(i)
    correct(3) += mtx(i+8)  * vec(i)
    correct(4) += mtx(i+12) * vec(i)
  Next

  /* Identify a platform */
  ErrCl = clGetPlatformIDs(1, platform, ByVal NULL)
  If (ErrCl < 0) Then
    perror("Couldn't find any platforms")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Access a device: prefer a GPU, but fall back to a CPU device so the
     example also runs on machines that only have a CPU OpenCL runtime */
  ErrCl = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, device, ByVal NULL)
  If (ErrCl < 0) Then
    ErrCl = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, device, ByVal NULL)
  End If
  If (ErrCl < 0) Then
    perror("Couldn't find any devices")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Create the context */
  context = clCreateContext(ByVal NULL, 1, device, ByVal NULL, ByVal NULL, ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create a context")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Read program file and place content into buffer;
     an empty buffer is treated as "file not found" */
  program_buffer = FILE_Load($PROGRAM_FILE)
  program_size = Len(program_buffer)
  If (program_size = 0) Then
    perror("Couldn't find the program file")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Create program from file */
  program = clCreateProgramWithSource(context, 1, VarPtr(program_buffer), program_size, ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create the program")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If
  program_buffer = ""

  /* Build program */
  ErrCl = clBuildProgram(program, 0, ByVal 0, "", 0, ByVal 0)
  If (ErrCl < 0) Then

    /* Find size of log and print to std output */
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          0, ByVal NULL, log_size)
    /* One extra byte so the log stays NUL-terminated */
    program_log = Repeat$(log_size+1, $NUL)

    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          log_size + 1, ByVal StrPtr(program_log), ByVal NULL)
    PrintL program_log
    program_log = ""

    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Create kernel for the function named by $KERNEL_FUNC */
  kernel = clCreateKernel(program, StrPtr($KERNEL_FUNC), ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create the kernel")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Create CL buffers to hold input and output data;
     every creation is checked, not only the first one */
  mat_buff = clCreateBuffer(context, CL_MEM_READ_ONLY |
                            CL_MEM_COPY_HOST_PTR, SizeOf(Single)*16, mtx, ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create a buffer object")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  vec_buff = clCreateBuffer(context, CL_MEM_READ_ONLY |
                            CL_MEM_COPY_HOST_PTR, SizeOf(Single)*4, vec, ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create a buffer object")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  res_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                            SizeOf(Single)*4, ByVal NULL, ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create a buffer object")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Create kernel arguments from the CL buffers:
     0 = matrix, 1 = vector, 2 = result */
  ErrCl = clSetKernelArg(kernel, 0, SizeOf(tcl_mem), mat_buff)
  If (ErrCl < 0) Then
    perror("Couldn't set the kernel argument")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  ErrCl = clSetKernelArg(kernel, 1, SizeOf(tcl_mem), vec_buff)
  If (ErrCl < 0) Then
    perror("Couldn't set the kernel argument")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  ErrCl = clSetKernelArg(kernel, 2, SizeOf(tcl_mem), res_buff)
  If (ErrCl < 0) Then
    perror("Couldn't set the kernel argument")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Create a CL command queue for the device */
  queue = clCreateCommandQueue(context, device, 0, ErrCl)
  If (ErrCl < 0) Then
    perror("Couldn't create the command queue")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Enqueue the kernel on the device: one work-item per result row */
  work_units_per_kernel = 4 /* 4 work-units per kernel */
  ErrCl = clEnqueueNDRangeKernel(queue, kernel, 1, ByVal NULL, work_units_per_kernel,
                                 ByVal NULL, 0, ByVal NULL, ByVal NULL)
  If (ErrCl < 0) Then
    perror("Couldn't enqueue the kernel execution command")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Read the result (CL_TRUE = blocking read) */
  ErrCl = clEnqueueReadBuffer(queue, res_buff, CL_TRUE, 0, SizeOf(Single)*4,
                              result, 0, ByVal NULL, ByVal NULL)

  If (ErrCl < 0) Then
    perror("Couldn't enqueue the read buffer command")
    APP_SetReturnCode(1) : WaitKey : Exit Function
  End If

  /* Test the result against the host-computed reference */
  If((result(1) = correct(1)) And (result(2) = correct(2)) And (result(3) = correct(3)) And (result(4) = correct(4))) Then

    PrintL("Matrix-vector multiplication successful.")

  Else

    PrintL("Matrix-vector multiplication unsuccessful.")

  End If
  PrintL("Calculated result: " + Join$(result, ", "))
  PrintL("Expected result : " + Join$(correct, ", "))

  /* Deallocate resources */
  clReleaseMemObject(mat_buff)
  clReleaseMemObject(vec_buff)
  clReleaseMemObject(res_buff)
  clReleaseKernel(kernel)
  clReleaseCommandQueue(queue)
  clReleaseProgram(program)
  clReleaseContext(context)

  PrintL "Press any key to continue..."
  WaitKey
  APP_SetReturnCode(0)

End Function

Function pError( sError As String )

  /*
   * Prints sError on its own console line in bright red, then switches the
   * console text attribute to red + green + blue for subsequent output.
   */
  Long alertColor  = %CONSOLE_FOREGROUND_RED | %CONSOLE_FOREGROUND_INTENSITY
  Long normalColor = %CONSOLE_FOREGROUND_RED | %CONSOLE_FOREGROUND_GREEN | %CONSOLE_FOREGROUND_BLUE

  Console_SetTextAttribute(alertColor)
  PrintL sError
  Console_SetTextAttribute(normalColor)

End Function


You will need the latest ThinBASIC and the OpenCL headers (http://www.thinbasic.com/community/showthread.php?10159-OpenCL-Headers-Updated-Sep-15-2011) to run it, plus, of course, a modern GPU or CPU (http://www.thinbasic.com/community/showthread.php?10161-OpenCL-Supported-hardware).


Enjoy!,
Petr

Petr Schreiber
18-09-2012, 15:24
Updated code on 18-9-2012, 15:22 CET - there was a typo in error handling of failed build, fixed :)

kryton9
19-09-2012, 07:49
Thanks for the first of hopefully many examples, Petr. I looked over the code, and using OpenCL looks daunting. Good luck as you make your way through the book and code; it will be a valuable resource for the rest of us, so thanks!

Petr Schreiber
19-09-2012, 08:33
The example in the first chapter is meant to give an overview of how a finished minimal example works. There are a lot of odd things to see, and it might even look confusing.
The examples from chapter 2 onward start right from the basics; I will start posting them soon. You will see it is nothing to be scared of :)


Petr

matthew
10-09-2013, 20:49
Finally had the chance to try this. I haven't got around to trying the other examples yet but I'm sure they'll work. :-D

http://is.gd/5goR1s

Petr Schreiber
11-09-2013, 08:18
Hi Matthew,

thanks for testing. Did you have to modify the headers in any way for your Radeon?


Petr

matthew
11-09-2013, 11:10
No, not yet, but this is the only example that I've tried so far. Maybe when I try the other examples later I'll need to make some changes?

Petr Schreiber
11-09-2013, 11:30
Hopefully not,

OpenCL C is not device specific. My only worry was about the name of the DLL on ATi/AMD cards - in the first releases it was atiocl.dll, while NVidia and Intel had OpenCL.DLL.
It seems Radeon guys have changed the naming convention, good for us :)


Petr