08-08-2019 12:35 AM - edited 08-08-2019 12:36 AM
Hello all,
I find that printf - kernel debugging seems to not work in codes that I write. I've checked that this feature works in the provided SDAccel example.
(not work = nothing changes, as if the printf was commented out)
Due to specific circumstances, I am unable to use HW-emulation and SW-emulation so I can't test the printf feature at emulation. (And the inability to use emulation makes the necessity of a properly functioning printf more important)
I believe this is a problem of my kernel code. Any help would be appreciated There is no difference on my compiling environment between my code and the SDAccel example code, and I've checked that there are no differences in the makefiles except for the kernel names.
And also, if someone could give me insight into a better kernel coding style, would be much more appreciated.
I'm currently using SDaccel 18.2 and Ubuntu 16.04.6
//#include <stdio.h> #define BLOCK_SIZE 128 #define BLOCK_COUNT 1 #define SMALL_BLOCK_SIZE 8 #define AXI_WIDTY 512 /* This is a 128 * 128 matrix multiplication */ /* Alligned with 512 bit */ void read(global uint16 *in, uint16 *buffer_in){ __attribute__((xcl_pipeline_loop)) for (int i = 0; i < SMALL_BLOCK_SIZE; i++){ buffer_in[i] = in[i]; } } void write(global uint16 *out, uint16 *buffer_out){ __attribute__((xcl_pipeline_loop)) for (int i = 0; i < SMALL_BLOCK_SIZE; i++){ buffer_out[i] = out[i]; } } void mult(int* local_in1, int* local_in2, int* local_out) { //Reads the input_data from local memory, performs the computations //and writes the data to local memory for(int i = 0; i < BLOCK_SIZE; i++){ for(int j = 0; j < BLOCK_SIZE; j++){ local_out[i * BLOCK_SIZE + j] = 0; write_data: for(int k = 0; k < BLOCK_SIZE; k++){ local_out[i * BLOCK_SIZE + j] += local_in1[i * BLOCK_SIZE + k] * local_in2[k * BLOCK_SIZE + j]; } } } } kernel __attribute__((reqd_work_group_size(1, 1, 1))) void mmult( global uint16* in1, //Read-only input matrix1 global uint16* in2, //Read-only input matrix2 global uint16* out, //Output matrix const int first_block_i, const int first_block_j, const int second_block_i, const int second_block_j, const int dim //One dimension of the matrix //1024 ) { //int dim = *size; //Local memory to store input matrices //Local memory is implemented as BRAM memory blocks uint16 local_in1[BLOCK_SIZE][BLOCK_SIZE]; uint16 local_in2[BLOCK_SIZE][BLOCK_SIZE]; uint16 local_out[BLOCK_SIZE][BLOCK_SIZE]; const int first_start = BLOCK_COUNT * BLOCK_SIZE * BLOCK_SIZE * first_block_i + BLOCK_SIZE * first_block_j; const int second_start = BLOCK_COUNT * BLOCK_SIZE * BLOCK_SIZE * second_block_i + BLOCK_SIZE * second_block_j; const int out_start = BLOCK_COUNT * BLOCK_SIZE * BLOCK_SIZE * first_block_i + BLOCK_SIZE * second_block_j; int offset = 0; //__attribute__ ((xcl_dataflow)) for (int i = 0; i < BLOCK_SIZE; i++, offset += SMALL_BLOCK_SIZE * BLOCK_COUNT){ read(in1 + first_start + offset, local_in1 + offset); read(in2 + second_start + offset, local_in2 + offset); } printf("read is %d, %d, %d, %d\n", local_in1[0], local_in1[1], local_in1[2], local_in1[3]); mult(local_in1, local_in2, local_out); int noffset = 0; for (int i = 0; i < BLOCK_SIZE; i++, noffset += SMALL_BLOCK_SIZE * BLOCK_COUNT){ write(out + out_start + noffset, local_out + noffset); } }
08-09-2019 05:32 AM
Are you going to print something like following?
printf("read is %u, %u, %u, %u\n", local_in1[0][0], local_in1[1][0], local_in1[2][0], local_in1[3][0]);
Please refer to https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/printfFunction.html for printf usage.
08-09-2019 05:32 AM
Are you going to print something like following?
printf("read is %u, %u, %u, %u\n", local_in1[0][0], local_in1[1][0], local_in1[2][0], local_in1[3][0]);
Please refer to https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/printfFunction.html for printf usage.
08-11-2019 01:43 AM
Such a simple, simple error....