Why am I not getting I/O-compute overlap with this code? - asynchronous

The following program:
#include <iostream>
#include <array>
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles)
{
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
__global__ void dummy(clock_value_t duration_in_cycles)
{
gpu_sleep(duration_in_cycles);
}
int main()
{
const clock_value_t duration_in_clocks = 1e7;
const size_t buffer_size = 5e7;
constexpr const auto num_streams = 2;
std::array<char*, num_streams> host_ptrs;
std::array<char*, num_streams> device_ptrs;
std::array<cudaStream_t, num_streams> streams;
for (auto i=0; i<num_streams; i++) {
cudaMallocHost(&host_ptrs[i], buffer_size);
cudaMalloc(&device_ptrs[i], buffer_size);
cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
}
cudaDeviceSynchronize();
for (auto i=0; i<num_streams; i++) {
cudaMemcpyAsync(device_ptrs[i], host_ptrs[i], buffer_size,
cudaMemcpyDefault, streams[i]);
dummy<<<128, 128, 0, streams[i]>>>(duration_in_clocks);
cudaMemcpyAsync(host_ptrs[i], device_ptrs[i], buffer_size,
cudaMemcpyDefault, streams[i]);
}
for (auto i=0; i<num_streams; i++) { cudaStreamSynchronize(streams[i]); }
for (auto i=0; i<num_streams; i++) {
cudaFreeHost(host_ptrs[i]);
cudaFree(device_ptrs[i]);
}
}
should result in overlapping I/O and Compute between the work on the first and second streams: When the first stream's Host-to-Device ends, the first stream's kernel can start, but so can the second stream's Host-to-Device transfer. Instead, I get the following timeline, with no overlap:
I think I've covered my bases to ensure overlap. The streams are non-blocking (and indeed the enqueueing of work concludes well before the first HtoD does); the host memory is pinned... so what's missing for me to see overlap?
Using CUDA 8.0.61 on GNU/Linux Mint 18.2 with an NVIDIA GTX 650 Ti Boost. But the driver is v384.59.

Ok, it must be something with my GPU model, because with Fedora 25, and a GTX Titan X, I get:

Related

On running the code in Intel Devcloud, it is throwing runtime error

I was trying to run this code but while compiling in intel devcloud using these commands:
icpx -qopenmp -fopenmp-targets=spir64 openmp_target_offload_clause_ordering.cpp
export OMP_TARGET_OFFLOAD=MANDATORY
it is showing runtime error.
#include <stdio.h>
int main() {
double *V = reinterpret_cast<double*>(0xdeadbeef);
printf("pointer=%p\n", V);
#pragma omp target parallel for simd is_device_ptr(V) if(true)
for(int i = 0; i < 1; ++i) {
printf("pointer=%p\n", V);
}
#pragma omp target parallel for simd if(true) is_device_ptr(V)
for(int i = 0; i < 1; ++i) {
printf("pointer=%p\n", V);
}
return 100;
}
The device pointer you are entering is invalid.
Replace (0xdeadbeef) with (omp_target_alloc(size, 0)) in the first line of your main function as follows:
double ptr = reinterpret_cast<double>(omp_target_alloc(size, 0));
Hope this helps!

QFuture Memoryleak

I want to parallelize a function and have the problem that after a few hours my memory is overloaded.
The test program calculates something simple, and works so far. Only the memory usage is constantly increasing.
QT Project file:
QT -= gui
QT += concurrent widgets
CONFIG += c++11 console
CONFIG -= app_bundle
DEFINES += QT_DEPRECATED_WARNINGS
SOURCES += main.cpp
QT program file:
#include <QCoreApplication>
#include <qdebug.h>
#include <qtconcurrentrun.h>
double parallel_function(int instance){
return (double)(instance)*10.0;
}
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
int nr_of_threads = 8;
double result_sum,temp_var;
for(qint32 i = 0; i<100000000; i++){
QFuture<double> * future = new QFuture<double>[nr_of_threads];
for(int thread = 0; thread < nr_of_threads; thread++){
future[thread] = QtConcurrent::run(parallel_function,thread);
}
for(int thread = 0; thread < nr_of_threads; thread++){
future[thread].waitForFinished();
temp_var = future[thread].result();
qDebug()<<"result: " << temp_var;
result_sum += temp_var;
}
}
qDebug()<<"total: "<<result_sum;
return a.exec();
}
As I have observed, QtConcurrent::run(parallel_function,thread) allocates memory, but does not release memory after future[thread].waitForFinished().
What's wrong here?
You have memory leak because future array is not deleted. Add delete[] future at the end of outer for loop.
for(qint32 i = 0; i<100000000; i++)
{
QFuture<double> * future = new QFuture<double>[nr_of_threads];
for(int thread = 0; thread < nr_of_threads; thread++){
future[thread] = QtConcurrent::run(parallel_function,thread);
}
for(int thread = 0; thread < nr_of_threads; thread++){
future[thread].waitForFinished();
temp_var = future[thread].result();
qDebug()<<"result: " << temp_var;
result_sum += temp_var;
}
delete[] future; // <--
}
Here's how this might look - note how much simpler everything can be! You're dead set on doing manual memory management: why? First of all, QFuture is a value. You can store it very efficiently in any vector container that will manage the memory for you. You can iterate such a container using range-for. Etc.
QT = concurrent # dependencies are automatic, you don't use widgets
CONFIG += c++14 console
CONFIG -= app_bundle
SOURCES = main.cpp
Even though the example is synthetic and the map_function is very simple, it's worth considering how to do things most efficiently and expressively. Your algorithm is a typical map-reduce operation, and blockingMappedReduce has half the overhead of manually doing all of the work.
First of all, let's recast the original problem in C++, instead of some C-with-pluses Frankenstein.
// https://github.com/KubaO/stackoverflown/tree/master/questions/future-ranges-49107082
/* QtConcurrent will include QtCore as well */
#include <QtConcurrent>
#include <algorithm>
#include <iterator>
using result_type = double;
static result_type map_function(int instance){
return instance * result_type(10);
}
static void sum_modifier(result_type &result, result_type value) {
result += value;
}
static result_type sum_function(result_type result, result_type value) {
return result + value;
}
result_type sum_approach1(int const N) {
QVector<QFuture<result_type>> futures(N);
int id = 0;
for (auto &future : futures)
future = QtConcurrent::run(map_function, id++);
return std::accumulate(futures.cbegin(), futures.cend(), result_type{}, sum_function);
}
There is no manual memory management, and no explicit splitting into "threads" - that was pointless, since the concurrent execution platform is aware of how many threads there are. So this is already better!
But this seems quite wasteful: each future internally allocates at least once (!).
Instead of using futures explicitly for each result, we can use the map-reduce framework. To generate the sequence, we can define an iterator that provides the integers we wish to work on. The iterator can be a forward or a bidirectional one, and its implementation is the bare minimum needed by QtConcurrent framework.
#include <iterator>
template <typename tag> class num_iterator : public std::iterator<tag, int, int, const int*, int> {
int num = 0;
using self = num_iterator;
using base = std::iterator<tag, int, int, const int*, int>;
public:
explicit num_iterator(int num = 0) : num(num) {}
self &operator++() { num ++; return *this; }
self &operator--() { num --; return *this; }
self &operator+=(typename base::difference_type d) { num += d; return *this; }
friend typename base::difference_type operator-(self lhs, self rhs) { return lhs.num - rhs.num; }
bool operator==(self o) const { return num == o.num; }
bool operator!=(self o) const { return !(*this == o); }
typename base::reference operator*() const { return num; }
};
using num_f_iterator = num_iterator<std::forward_iterator_tag>;
result_type sum_approach2(int const N) {
auto results = QtConcurrent::blockingMapped<QVector<result_type>>(num_f_iterator{0}, num_f_iterator{N}, map_function);
return std::accumulate(results.cbegin(), results.cend(), result_type{}, sum_function);
}
using num_b_iterator = num_iterator<std::bidirectional_iterator_tag>;
result_type sum_approach3(int const N) {
auto results = QtConcurrent::blockingMapped<QVector<result_type>>(num_b_iterator{0}, num_b_iterator{N}, map_function);
return std::accumulate(results.cbegin(), results.cend(), result_type{}, sum_function);
}
Could we drop the std::accumulate and use blockingMappedReduced instead? Sure:
result_type sum_approach4(int const N) {
return QtConcurrent::blockingMappedReduced(num_b_iterator{0}, num_b_iterator{N},
map_function, sum_modifier);
}
We can also try a random access iterator:
using num_r_iterator = num_iterator<std::random_access_iterator_tag>;
result_type sum_approach5(int const N) {
return QtConcurrent::blockingMappedReduced(num_r_iterator{0}, num_r_iterator{N},
map_function, sum_modifier);
}
Finally, we can switch from using range-generating iterators, to a precomputed range:
#include <numeric>
result_type sum_approach6(int const N) {
QVector<int> sequence(N);
std::iota(sequence.begin(), sequence.end(), 0);
return QtConcurrent::blockingMappedReduced(sequence, map_function, sum_modifier);
}
Of course, our point is to benchmark it all:
template <typename F> void benchmark(F fun, double const N) {
QElapsedTimer timer;
timer.start();
auto result = fun(N);
qDebug() << "sum:" << fixed << result << "took" << timer.elapsed()/N << "ms/item";
}
int main() {
const int N = 1000000;
benchmark(sum_approach1, N);
benchmark(sum_approach2, N);
benchmark(sum_approach3, N);
benchmark(sum_approach4, N);
benchmark(sum_approach5, N);
benchmark(sum_approach6, N);
}
On my system, in release build, the output is:
sum: 4999995000000.000000 took 0.015778 ms/item
sum: 4999995000000.000000 took 0.003631 ms/item
sum: 4999995000000.000000 took 0.003610 ms/item
sum: 4999995000000.000000 took 0.005414 ms/item
sum: 4999995000000.000000 took 0.000011 ms/item
sum: 4999995000000.000000 took 0.000008 ms/item
Note how using map-reduce on a random-iterable sequence has over 3 orders of magnitude lower overhead than using QtConcurrent::run, and is 2 orders of magnitude faster than non-random-iterable solutions.

Mandelbrot in OpenCL

I have this Mandelbrot Kernel written for an OpenCL program. For test I've decided to have all my complex plane on a vector. My problem is when I print the output I obtain a list of 1 (like the initialization of the results array) and not the result of the kernel work.
Where can I have the problem?
#include <iostream>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
int main(){
using namespace std;
int xPixel=100;
int yPixel=100;
float ics[xPixel];
for(int i=0;i<xPixel;++i)
ics[i]=-2+i*((float)4/xPixel);
float ypsilon[yPixel];
for(int i=0;i<yPixel;++i)
ypsilon[i]=-2+i*((float)4/yPixel);
int results[xPixel*yPixel];
for(int i=0;i<xPixel*yPixel;++i)
results[i]=1;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem memX, memY, memOutput;
size_t global;
const char *KernelSource =
"__kernel void mandelbrot(__global float *ics, __global float *ypsilon, __global int *output){\n"\
"size_t id=get_global_id(0);\n"\
"int yPixel=100;\n"\
"for(int i=0;i<yPixel;i++){\n"\
"float x=0;\n"\
"float y=0;\n"\
"int counter=0;\n"\
"while(counter<1000){\n"\
"if(x*x+y*y>2*2){\n"\
"output[(id*yPixel)+i]=counter;\n"\
"break;\n"\
"}\n"\
"float xTemp=x*x-y*y+ics[id];\n"\
"y=2*x*y+ypsilon[i];\n"\
"x=xTemp;\n"\
"counter++;\n"\
"}\n"\
"}\n"\
"}\n";
// retreives a list of platforms available
if (clGetPlatformIDs(1, &platform_id, &num_of_platforms)!= CL_SUCCESS){
cout<<"Unable to get platform_id\n"<<endl;;
return 1;
}
// try to get a supported GPU device
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&num_of_devices) != CL_SUCCESS){
cout<<"Unable to get device_id\n"<<endl;
return 1;
}
//context properties list - nust be terminated with 0
properties[0]=CL_CONTEXT_PLATFORM;
properties[1]=(cl_context_properties)platform_id;
properties[2]=0;
//create a context with the GPU device
context=clCreateContext(properties,1,&device_id,NULL,NULL,&err);
//create a command queue using the context and device
command_queue=clCreateCommandQueue(context,device_id,0,&err);
//create a program from the kernel source code
program=clCreateProgramWithSource(context,1,(const char**)&KernelSource,NULL,&err);
//compile the program
if(clBuildProgram(program,0,NULL,NULL,NULL,NULL)!=CL_SUCCESS){
cout<<"Error building program"<<endl;
return 1;
}
//specify which kernel from the program to execute
kernel=clCreateKernel(program,"mandelbrot",&err);
//create buffers for input and output
memX=clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(float)*xPixel,NULL,NULL);
memY=clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(float)*yPixel,NULL,NULL);
memOutput=clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(int)*(xPixel*yPixel),NULL,NULL);
//load data into the input buffer
clEnqueueWriteBuffer(command_queue,memX,CL_TRUE,0,sizeof(float)*xPixel,ics,0,NULL,NULL);
clEnqueueWriteBuffer(command_queue,memY,CL_TRUE,0,sizeof(float)*yPixel,ypsilon,0,NULL,NULL);
//set the argument list for the kernel command
clSetKernelArg(kernel,0,sizeof(cl_mem),&memX);
clSetKernelArg(kernel,1,sizeof(cl_mem),&memY);
clSetKernelArg(kernel,2,sizeof(cl_mem),&memOutput);
global=xPixel*yPixel;
//enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue,kernel,1,NULL,&global,NULL,0,NULL,NULL);
clFinish(command_queue);
//copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue,memOutput,CL_TRUE,0,sizeof(int)*(xPixel*yPixel),results,0,NULL,NULL);
//print output
for(int i=0;i<xPixel;++i){
for(int j=0;j<yPixel;++j){
cout<<results[(i*yPixel)+j]<<" ";
}
cout<<endl;
}
//cleanup - release OpenCL resources
clReleaseMemObject(memX);
clReleaseMemObject(memY);
clReleaseMemObject(memOutput);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
}
I'm not seeing the exact reason, but I do have a question: If you're running this on every element then what is the "i" looping over "yPixel" for? It seems like you're doing X*Y*Y work instead of X*Y work (your global size is X*Y then the kernel loops on Y again).
If you add "output[(id*yPixel)+i]=42" before the "i" loop then what does your output buffer hold? That will tell you if the problem lies in your kernel or your host code.
To help anyone else looking at this, I've reformatted the kernel code:
__kernel void mandelbrot(__global float *ics, __global float *ypsilon, __global int *output)
{
size_t id=get_global_id(0);
int yPixel=100;
for(int i=0;i<yPixel;i++)
{
float x=0;
float y=0;
int counter=0;
while(counter<1000)
{
if(x*x+y*y>2*2)
{
output[(id*yPixel)+i]=counter;
break;
}
float xTemp=x*x-y*y+ics[id];
y=2*x*y+ypsilon[i];
x=xTemp;
counter++;
}
}
}

clBuildProgram yields AccessViolationException when building this specific kernel

This is a part of some sort of parallel reduction/extremum kernel. I have reduced it to the minimum code that still gets clBuildProgram crashing (note that it really crashes, and doesn't just return an error code):
EDIT: It seems like this also happens when local_value is declared global instead of local.
EDIT2 / SOLUTION: The problem was that there was an infinite loop. I should have written remaining_items >>= 1 instead of remaining_items >> 1. As has been said in the answers, the nvidia compiler seems not very robust when it comes to compile/optimization errors.
kernel void testkernel(local float *local_value)
{
size_t thread_id = get_local_id(0);
int remaining_items = 1024;
while (remaining_items > 1)
{
// throw away the right half of the threads
remaining_items >> 1; // <-- SPOTTED THE BUG
if (thread_id > remaining_items)
{
return;
}
// look for a greater value in the right half of the memory space
int right_index = thread_id + remaining_items;
float right_value = local_value[right_index];
if (right_value > local_value[thread_id])
{
local_value[thread_id] = right_value;
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
Removing the lines return; and/or local_value[thread_id] = right_value; causes clBuildProgram to finish successfully.
I can reproduce this problem on all of my computers (NVIDIA GTX 560, GT 555M, GT 540M, they're all Fermi 2.1 architecture). It's apparent on the NVIDIA CUDA Toolkit SDK versions 4.0, 4.1 and 4.2, when using either x64 or x86 libraries.
Does anyone have an idea what could be the problem?
Is it possible, that local (aka shared) memory is automatically assumed to be (WORK_GROUP_SIZE) * siezof(its_base_type)? That would explain why it works when the lines I mentioned above are removed.
Minimal host code (C99 compatible) for reproduction:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define RETURN_THROW(expression) do { cl_int ret = expression; if (ret) { printf(#expression " FAILED: %d\n" , ret); exit(1); } } while (0)
#define REF_THROW(expression) do { cl_int ret; expression; if (ret) { printf(#expression " FAILED: %d\n" , ret); exit(1); } } while (0)
int main(int argc, char **argv)
{
// Load the kernel source code into the array source_str
FILE *fp;
fp = fopen("testkernel.cl", "rb");
if (!fp)
{
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
fseek(fp, 0, SEEK_END);
int filesize = ftell(fp);
rewind(fp);
char *source_str = (char*)calloc(filesize, sizeof(char));
size_t bytes_read = fread(source_str, 1, filesize, fp);
source_str[bytes_read] = 0;
fclose(fp);
// Get platform information
cl_uint num_platforms;
RETURN_THROW(clGetPlatformIDs(0, NULL, &num_platforms));
cl_platform_id *platform_ids = (cl_platform_id *)calloc(num_platforms, sizeof(cl_platform_id));
RETURN_THROW(clGetPlatformIDs(num_platforms, platform_ids, NULL));
cl_device_id selected_device_id = NULL;
printf("available platforms:\n");
for (cl_uint i = 0; i < num_platforms; i++)
{
char platform_name[50];
RETURN_THROW(clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 50, platform_name, NULL));
printf("%s\n", platform_name);
// get devices for this platform
cl_uint num_devices;
RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices));
cl_device_id *device_ids = (cl_device_id *)calloc(num_devices, sizeof(cl_device_id));
RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, num_devices, device_ids, NULL));
// select first nvidia device
if (strstr(platform_name, "NVIDIA")) // ADAPT THIS ACCORDINGLY
{
selected_device_id = device_ids[0];
}
}
if (selected_device_id == NULL)
{
printf("No NVIDIA device found\n");
exit(1);
}
// Create an OpenCL context
cl_context context;
REF_THROW(context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret));
// Create a program from the kernel source
cl_program program;
REF_THROW(program = clCreateProgramWithSource(context, 1, (const char **)&source_str, NULL, &ret));
// Build the program
cl_int ret = clBuildProgram(program, 1, &selected_device_id, NULL, NULL, NULL);
if (ret)
{
printf("BUILD ERROR\n");
// build error - get build log and display it
size_t build_log_size;
ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
char *build_log = new char[build_log_size];
ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
printf("%s\n", build_log);
exit(1);
}
printf("build finished successfully\n");
return 0;
}
In my experience the nvidia compiler isn't very robust when it comes to handling build errors, so you probably have a compile error somewhere.
I think your problem is indeed the return, or more to the point its combination with barrier. According to the opencl spec about barriers:
All work-items in a work-group executing the kernel on a processor
must execute this function before any are allowed to continue
execution beyond the barrier. This function must be encountered by all
work-items in a work-group executing the kernel.
If barrier is inside a conditional statement, then all work-items must enter the
onditional if any work-item enters the conditional statement and
executes the barrier.
If barrer is inside a loop, all work-items
must execute the barrier for each iteration of the loop before any are
allowed to continue execution beyond the barrier.
So I think your problem is probably that a lot of threads would return before getting to the barrier, making this code invalid. Maybe you should try something like this:
kernel void testkernel(local float *local_value) {
size_t thread_id = get_local_id(0);
int remaining_items = 1024;
while (remaining_items > 1) {
remaining_items >>= 1;// throw away the right half of the threads
if (thread_id <= remaining_items) {
// look for a greater value in the right half of the memory space
int right_index = thread_id + remaining_items;
float right_value = local_value[right_index];
if (right_value > local_value[thread_id])
local_value[thread_id] = right_value;
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
Edit: Furthermore as noted in the comments it needs to be remaining_items>>=1 instead of remaining_items>>1 in order to avoid producing an infinite loop.

QSemaphore producer consumer problem

This is more or less Qt's example with some small changes.
The output is PcPcPcPc...etc. I don't understand why.
Namely, I am confused about how sProducer.acquire(256); works. I believe I understand how sProducer.acquire(1); works. It doesn't make sense to me to acquire anything more than 1 because I don't see how acquiring more than 1 makes any difference logically. Could someone explain this? On the surface, writing 1 byte and reading 1 byte doesn't seem very efficient due to semaphore overhead...but acquiring more resources doesn't seem to make a performance difference nor does the code make sense.
Logically I think both the acquire and release have to have the same number (whatever that number is). But how can I modify this code so I can acquire more (say 256) and thus reduce semaphore overhead? The code bellow just doesn't make sense to me when acquire and release is not 1.
#include <QtCore>
#include <iostream>
#include <QTextStream>
//Global variables.
QTextStream out(stdout);
QTextStream in(stdin);
const int DataSize = 1024;
const int BufferSize = 512;
char buffer[BufferSize];
QSemaphore sProducer(BufferSize);
QSemaphore sConsumer(0);
//-----------------------------
class Producer : public QThread
{
public:
void run();
};
void Producer::run()
{
for (int i = 0; i < DataSize; ++i) {
sProducer.acquire(256);
buffer[i % BufferSize] = 'P';
sConsumer.release(256);
}
}
class Consumer : public QThread
{
public:
void run();
};
void Consumer::run()
{
for (int i = 0; i < DataSize; ++i) {
sConsumer.acquire(256);
std::cerr << buffer[i % BufferSize];
out << "c";
out.flush();
sProducer.release(256);
}
std::cerr << std::endl;
}
int main()
{
Producer producer;
Consumer consumer;
producer.start();
consumer.start();
producer.wait();
consumer.wait();
in.readLine(); //so i can read console text.
return 0;
}
Since there is only one producer and one consumer, they can move freely their own private cursor, their i variable, of the amount of bytes they want, as long as there is enough room to do that (something higher that 256 on both sides with a 512 buffer would cause a deadlock).
Basically, when a thread successfully acquire 256 bytes, it means it can safely read or write these 256 bytes in one single operation, so you just have to put another loop inside the acquire/release block to handle that number of bytes.
For the producer:
void Producer::run()
{
for (int i = 0; i < DataSize; ++i) {
const int blockSize = 256;
sProducer.acquire(blockSize);
for(int j = 0; j < blockSize; ++i, ++j) {
buffer[i % BufferSize] = 'P';
}
sConsumer.release(blockSize);
}
}
And for the consumer
void Consumer::run()
{
for (int i = 0; i < DataSize; ++i) {
const int blockSize = 128;
sConsumer.acquire(blockSize);
for(int j = 0; j < blockSize; ++i, ++j) {
std::cerr << buffer[i % BufferSize];
out << "c";
out.flush();
}
sProducer.release(blockSize);
}
std::cerr << std::endl;
}

Resources