I have a weird situation where I'm trying to allocate 128k worth of local storage for a C66x kernel (16k floats - 8k complex elements). When I do so, I can run about 10-20 iterations of the kernel, and then at some point OpenCL stops responding to queue requests. If I repeat the test several times, the number of iterations completed before the freeze varies from run to run.
If I drop the kernel requirement to 64k (8k floats, 4k complex elements), then I can run as many iterations of the kernel as I care to.
Note that the kernel itself is simply a call to DSPF_sp_fftSPxSP. If I modify the kernel to be a simpler task (such as for-loop copying the data from a DDR buffer into the L2 buffer), then no freeze occurs.
Note also that the DSPF_sp_fftSPxSP kernel runs fine if I change the local storage to use DDR instead - but that's a lot slower.
Is there something specific about the DSPF_sp_fftSPxSP library that randomly requires more space than the leftover scratchpad? It's weird that I can run a few of the kernels before a failure.
Is there anything I can do to reliably run an 8k-point single-precision FFT that uses all 128k of available L2?
I've pasted my main() here and attached a zip of the full source.
int main(int argc, char *argv[]) { unsigned int result; int block_count = 0; try { /* Setup OpenCL */ Context context(CL_DEVICE_TYPE_ACCELERATOR); std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); int numDevices = devices.size(); CommandQueue* Q[numDevices]; std::string device_name; for (int d = 0; d < numDevices; d++) { devices[d].getInfo(CL_DEVICE_NAME, &device_name); cout << device_name << endl; Q[d] = new CommandQueue(context, devices[d], CL_QUEUE_PROFILING_ENABLE); } printf("Loading C66x binary\n"); char *c66_binary; int binary_length = ocl_read_binary("kernel-test-l2.out", c66_binary); Program::Binaries program_binary(1, std::make_pair(c66_binary, binary_length)); Program program = Program(context, devices, program_binary); program.build(devices); /* Get the kernel for spectral window */ printf("Setting up fft_test\n"); Kernel fft_test(program, "fft_buffer_test"); /* Allocate initial space for our input data */ float *fft_t_buffer = (float *)memalign(4, sizeof(float) * FFT_SIZE); float *twiddle = (float *)memalign(4, sizeof(float) * FFT_SIZE); float *window = (float *)memalign(4, sizeof(float) * WINDOW_SIZE); tw_gen(twiddle, WINDOW_SIZE); Buffer fft_time_buf(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * FFT_SIZE, fft_t_buffer); Buffer fft_freq_buf(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * FFT_SIZE, fft_t_buffer); Buffer tw(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * FFT_SIZE, twiddle); Buffer wnd(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * FFT_SIZE, window); /* When we're mapping buffers, we can't use the old pointers any more. 
Make some new pointers */ float *fft_write_buffer = NULL; Event fft_process_event; Event unmap_event; while (true) { block_count++; printf("block %d\n", block_count); /* map the buffer so we can start writing to it */ fft_write_buffer = (float *) Q[0]->enqueueMapBuffer(fft_time_buf, CL_TRUE, CL_MAP_WRITE, 0, FFT_SIZE * sizeof(float)); /* Generate random signal */ for (int frame = 0; frame < WINDOW_SIZE; ++frame) { fft_write_buffer[frame * 2] = -1 + (2.0 * ((float)rand()) / RAND_MAX); /* No imaginary data */ fft_write_buffer[(frame * 2) + 1] = 0; } /* Unmap the memory area so dsp can use it now */ if (result = Q[0]->enqueueUnmapMemObject(fft_time_buf, fft_write_buffer, NULL, &unmap_event) != CL_SUCCESS) { printf("Error enqueing unmap memory"); return result; } /* Kick off the DSP work */ fft_test.setArg(0, FFT_SIZE); fft_test.setArg(1, fft_time_buf); fft_test.setArg(2, __local(FFT_SIZE * sizeof(float))); fft_test.setArg(3, tw); fft_test.setArg(4, (FFT_SIZE % 4 == 0) ? 4 : 2); /* Make sure unmapping is done before we continue */ unmap_event.wait(); /* Send the task to the DSP */ if (result = Q[0]->enqueueTask(fft_test, NULL, &fft_process_event) != CL_SUCCESS) { printf("error enqueing fft task\n"); return result; } fft_process_event.wait(); ocl_event_times(fft_process_event, "FFT"); } } catch (Error err) { cerr << "ERROR: " << err.what() << "(" << err.err() << ", " << ocl_decode_error(err.err()) << ")" << endl; } return 0; }
/cfs-file/__key/communityserver-discussions-components-files/791/dsp_2D00_buffer_2D00_test.zip