cancel
Showing results for 
Show  only  | Search instead for 
Did you mean: 
kazum
Visitor
Visitor
1,128 Views
Registered: ‎06-13-2018

clEnqueueWriteBuffer doesn't increase reference counts

I tried the following code on an AWS F1 instance (FPGA developer AMI 1.5.0, Xilinx SDx 2018.2), and then I noticed that clEnqueueWriteBuffer didn't increase a reference count.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <CL/cl.h>

#define ARRAY_SIZE	(128)
#define ALIGN		(16)

char *array_a;

static void check_status(cl_int err, const char *api);

int main()
{
	cl_int status;
	cl_platform_id platform_id;
	cl_uint num_platforms;
	status = clGetPlatformIDs(1, &platform_id, &num_platforms);
	check_status(status, "clGetPlatformIDs");

	cl_device_id device_id;
	cl_uint num_devices;
	status = clGetDeviceIDs(platform_id,
				CL_DEVICE_TYPE_ACCELERATOR,
				1,
				&device_id,
				&num_devices);
	check_status(status, "clGetDeviceIDs");

	cl_context context = clCreateContext(NULL,
					     1,
					     &device_id,
					     NULL,
					     NULL,
					     &status);
	check_status(status, "clCreateContext");

	cl_command_queue command_queue = clCreateCommandQueue(context,
							      device_id,
							      0,
							      &status);
	check_status(status, "clCreateCommandQueue");

	if (posix_memalign((void **)&array_a, ALIGN, ARRAY_SIZE * sizeof(cl_float)) != 0) {
		fprintf(stderr, "Could not allocate memory.\n");
		exit(1);
	}
	strcpy(array_a, "hello, world!");

	cl_mem mem = clCreateBuffer(context,
			     CL_MEM_READ_WRITE,
			     ARRAY_SIZE * sizeof(cl_float),
			     NULL,
			     &status);
	check_status(status, "clCreateBuffer b");

	cl_uint refcnt;
	status = clGetMemObjectInfo(mem, CL_MEM_REFERENCE_COUNT,
				      sizeof(cl_uint), &refcnt, NULL);
	check_status(status, "clGetMemObjectInfo");
	printf("cnt %d\n", refcnt);

	status = clEnqueueWriteBuffer(
		command_queue,
		mem,
		CL_FALSE, 0, ARRAY_SIZE * sizeof(cl_float),
		array_a,
		0, NULL, NULL);
	check_status(status, "opencl call");

	status = clGetMemObjectInfo(mem, CL_MEM_REFERENCE_COUNT,
				      sizeof(cl_uint), &refcnt, NULL);
	check_status(status, "clGetMemObjectInfo");
	printf("cnt %d\n", refcnt);

	free(array_a);
	status = clReleaseMemObject(mem);
	check_status(status, "clReleaseMemObject");
	status = clReleaseCommandQueue(command_queue);
	check_status(status, "clReleaseCommandQueue");
	status = clReleaseContext(context);
	check_status(status, "clReleaseContext");

	return 0;
}

static void check_status(cl_int err, const char *api)
{
	if (err == CL_SUCCESS)
		return;

	fprintf(stderr, "API %s error: %d\n", api, err);
	exit(1);
}

The output was as follows:

$ export XCL_EMULATION_MODE=sw_emu
$ gcc -g -O3 -Wall main.c -lOpenCL
$ ./a.out
ERROR: xclProbe-scan failed at fpga_pci_get_all_slot_specs
xclProbe found 0 FPGA slots with xocl driver running
cnt 1
cnt 1
Segmentation fault (core dumped)

The spec of OpenCL says that clRelaseMemObject shouldn't release a memory object which is used:

https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clReleaseMemObject.html

After the memobj reference count becomes zero and commands queued for execution on a command-queue(s) that use memobj have finished, the memory object is deleted.

Is it a bug of the SDAccel platform?

I also tried valgrind and the program accessed the freed memory actually.

$ valgrind ./a.out 
==4930== Memcheck, a memory error detector
==4930== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==4930== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==4930== Command: ./a.out
==4930== 
ERROR: xclProbe-scan failed at fpga_pci_get_all_slot_specs
xclProbe found 0 FPGA slots with xocl driver running
cnt 1
cnt 1
==4930== Thread 5:
==4930== Invalid read of size 8
==4930==    at 0x5ABD7F3: xocl::device::write_buffer(xocl::memory*, unsigned long, unsigned long, void const*) (in /opt/xilinx/xrt/lib/libxilinxopencl.so.2.1.0)
==4930==    by 0x5A936FA: (anonymous namespace)::write_buffer(xocl::event*, xocl::device*, _cl_mem*, unsigned long, unsigned long, void const*) (in /opt/xilinx/xrt/lib/libxilinxopencl.so.2.1.0)
==4930==    by 0x5A98EC0: std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>, std::__future_base::_Result_base::_Deleter>, std::_Bind_simple<std::reference_wrapper<std::_Bind<void (*(xocl::event*, xocl::device*, _cl_mem*, unsigned long, unsigned long, void const*))(xocl::event*, xocl::device*, _cl_mem*, unsigned long, unsigned long, void const*)> > ()>, void> >::_M_invoke(std::_Any_data const&) (in /opt/xilinx/xrt/lib/libxilinxopencl.so.2.1.0)
==4930==    by 0x5A99AB8: std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) (in /opt/xilinx/xrt/lib/libxilinxopencl.so.2.1.0)
==4930==    by 0x617CE6F: pthread_once (in /usr/lib64/libpthread-2.17.so)
==4930==    by 0x5A9B656: std::__future_base::_Task_state<std::_Bind<void (*(xocl::event*, xocl::device*, _cl_mem*, unsigned long, unsigned long, void const*))(xocl::event*, xocl::device*, _cl_mem*, unsigned long, unsigned long, void const*)>, std::allocator<int>, void ()>::_M_run() (in /opt/xilinx/xrt/lib/libxilinxopencl.so.2.1.0)
==4930==    by 0x5AFD2F4: xrt::task::worker2(xrt::task::mpmcqueue<xrt::task::task>&, std::string const&) (in /opt/xilinx/xrt/lib/libxilinxopencl.so.2.1.0)
==4930==    by 0x5AFAACE: std::thread::_State_impl<std::_Bind_simple<void (*(std::reference_wrapper<xrt::task::mpmcqueue<xrt::task::task> >, char const*))(xrt::task::mpmcqueue<xrt::task::task>&, std::string const&)> >::_M_run() (in /opt/xilinx/xrt/lib/libxilinxopencl.so.2.1.0)
==4930==    by 0x5B15A0E: execute_native_thread_routine (in /opt/xilinx/xrt/lib/libxilinxopencl.so.2.1.0)
==4930==    by 0x6177E24: start_thread (in /usr/lib64/libpthread-2.17.so)
==4930==    by 0x513ABAC: clone (in /usr/lib64/libc-2.17.so)
==4930==  Address 0x56b71c0 is 0 bytes inside a block of size 208 free'd
==4930==    at 0x4C2B16D: operator delete(void*) (vg_replace_malloc.c:576)
==4930==    by 0x5A8F89D: clReleaseMemObject (in /opt/xilinx/xrt/lib/libxilinxopencl.so.2.1.0)
==4930==    by 0x400B53: main (main.c:78)
==4930==  Block was alloc'd at
==4930==    at 0x4C2A1E3: operator new(unsigned long) (vg_replace_malloc.c:334)
==4930==    by 0x5A682EB: clCreateBuffer (in /opt/xilinx/xrt/lib/libxilinxopencl.so.2.1.0)
==4930==    by 0x400A87: main (main.c:51)
0 Kudos
3 Replies
kmorris
Xilinx Employee
Xilinx Employee
1,069 Views
Registered: ‎01-11-2011

Hello @kazum, I was able to re-create the output you described and will be looking into this further.

-------------------------------------------------------------------------
Please don’t forget to reply, kudo, and accept as solution!
-------------------------------------------------------------------------
kmorris
Xilinx Employee
Xilinx Employee
944 Views
Registered: ‎01-11-2011

Hi @kazum, after investigation this does appear to be a limitation, and I have filed a request with development to potentially address this issue in a future version of the tools.

A suggested alternative would be to change the host code such that it waits for the enqueue operation to finish prior to releasing the OpenCL objects. As the clEnqueueWriteBuffer in the example is non-blocking, the host code can be changed to wait for the enqueued operation to complete before releasing the object.

-------------------------------------------------------------------------
Please don’t forget to reply, kudo, and accept as solution!
-------------------------------------------------------------------------
kazum
Visitor
Visitor
929 Views
Registered: ‎06-13-2018

@kmorris, thanks for your investigation.  I think of adding clFinish before every clReleaseMemObject call - it looks like the only way to release the memory safely.

0 Kudos