This is more or less Qt's example with some small changes.
The output is PcPcPcPc... and so on, and I don't understand why.
Specifically, I am confused about how sProducer.acquire(256); works. I believe I understand how sProducer.acquire(1); works, but acquiring anything more than 1 doesn't make sense to me, because I don't see how it makes any logical difference. Could someone explain this? On the surface, writing one byte and reading one byte doesn't seem very efficient due to the semaphore overhead... but acquiring more resources doesn't seem to make a performance difference, nor does the code make sense to me then.
Logically, I think the acquire and the release have to use the same count (whatever that count is). But how can I modify this code so I can acquire more (say, 256) and thus reduce the semaphore overhead? The code below just doesn't make sense to me when the acquire and release count is not 1.
#include <QtCore>
#include <iostream>
#include <QTextStream>
//Global variables.
QTextStream out(stdout);
QTextStream in(stdin);
const int DataSize = 1024;
const int BufferSize = 512;
char buffer[BufferSize];
QSemaphore sProducer(BufferSize);
QSemaphore sConsumer(0);
//-----------------------------
class Producer : public QThread
{
public:
void run();
};
void Producer::run()
{
for (int i = 0; i < DataSize; ++i) {
sProducer.acquire(256);
buffer[i % BufferSize] = 'P';
sConsumer.release(256);
}
}
class Consumer : public QThread
{
public:
void run();
};
void Consumer::run()
{
for (int i = 0; i < DataSize; ++i) {
sConsumer.acquire(256);
std::cerr << buffer[i % BufferSize];
out << "c";
out.flush();
sProducer.release(256);
}
std::cerr << std::endl;
}
int main()
{
Producer producer;
Consumer consumer;
producer.start();
consumer.start();
producer.wait();
consumer.wait();
in.readLine(); // So I can read the console text.
return 0;
}
Since there is only one producer and one consumer, each can freely advance its own private cursor (its i variable) by however many bytes it wants, as long as there is enough room to do so (anything higher than 256 on both sides with a 512-byte buffer would cause a deadlock).
Basically, when a thread successfully acquires 256 bytes, it can safely read or write those 256 bytes in one single operation, so you just have to put another loop inside the acquire/release block to handle that number of bytes.
For the producer:
void Producer::run()
{
for (int i = 0; i < DataSize; /* i is advanced by the inner loop */) {
const int blockSize = 256;
sProducer.acquire(blockSize);
for(int j = 0; j < blockSize; ++i, ++j) {
buffer[i % BufferSize] = 'P';
}
sConsumer.release(blockSize);
}
}
And for the consumer:
void Consumer::run()
{
for (int i = 0; i < DataSize; /* i is advanced by the inner loop */) {
const int blockSize = 128;
sConsumer.acquire(blockSize);
for(int j = 0; j < blockSize; ++i, ++j) {
std::cerr << buffer[i % BufferSize];
out << "c";
out.flush();
}
sProducer.release(blockSize);
}
std::cerr << std::endl;
}
#include <iostream>
#include <vector>
#include <algorithm>
#include <queue>      // std::priority_queue
#include <functional> // std::binary_function (deprecated in C++11, removed in C++17)
using std::vector;
using std::cin;
using std::cout;
struct fj{
int indexI=0;
int freeT=0;
};
struct DereferenceCompareNode : public std::binary_function<fj, fj, bool>
{
bool operator()(const fj lhs, const fj rhs) const
{
return lhs.freeT > rhs.freeT;
}
};
class JobQueue {
private:
int num_workers_;
vector<int> jobs_;
vector<int> assigned_workers_;
vector<long long> start_times_;
void WriteResponse() const {
for (int i = 0; i < jobs_.size(); ++i) {
cout << assigned_workers_[i] << " " << start_times_[i] << "\n";
}
}
void ReadData() {
int m;
cin >> num_workers_ >> m;
jobs_.resize(m);
std::cout<<"Read fault"<<"\n";
for(int i = 0; i < m; i++)
cin >> jobs_[i];
std::cout<<"Read fault ends"<<"\n";
}
void AssignJobs() {
// TODO: replace this code with a faster algorithm.
std::cout<<"Fault point 1"<<"\n";
assigned_workers_.resize(jobs_.size());
start_times_.resize(jobs_.size());
vector<long long> next_free_time(num_workers_, 0);
std::priority_queue<int, vector<int>, std::greater<int> > thread;
std::priority_queue<fj, vector<fj>, DereferenceCompareNode > freeJob;
/*
for (int i = 0; i < jobs_.size(); ++i) {
int duration = jobs_[i];
int next_worker = 0;
for (int j = 0; j < num_workers_; ++j) {
if (next_free_time[j] < next_free_time[next_worker])
next_worker = j;
}
assigned_workers_[i] = next_worker;
start_times_[i] = next_free_time[next_worker];
next_free_time[next_worker] += duration;
}
*/
std::cout<<"dump point 2"<<"\n";
for(int i=0;i<num_workers_;i++){
thread.push(i);
}
std::cout<<"dump point 1"<<"\n";
int counter = 0;
while(jobs_.size()!=0){
std::cout<<"jobs_.size:"<<jobs_.size()<<"\n";
std::cout<<"freeJob.size:"<<freeJob.size()<<"\n";
//check logic
do{
if(freeJob.top().freeT == counter){
std::cout<<"freeJob.top().freeT:"<<freeJob.top().freeT<<"\n";
std::cout<<"counter:"<<counter<<"\n";
thread.push(freeJob.top().indexI);
freeJob.pop();
}else{
break;
}
}
while(freeJob.size()!=0);
std::cout<<"Thread:"<<thread.size()<<"\n";
while(thread.size()!=0){
if(jobs_.size()!=0){
fj currA;
currA.indexI = thread.top();
currA.freeT = jobs_.at(0)+counter;
std::cout<<"currA.indexI:"<<currA.indexI<<"\n";
std::cout<<"currA.freeT:"<<currA.freeT<<"\n";
thread.pop();
jobs_.erase(jobs_.begin());
assigned_workers_.push_back(currA.indexI);
start_times_.push_back(currA.freeT);
}else{
break;
}
}
counter++;
}
}
public:
void Solve() {
ReadData();
AssignJobs();
WriteResponse();
}
};
int main() {
std::ios_base::sync_with_stdio(false);
JobQueue job_queue;
job_queue.Solve();
return 0;
}
I am getting a segmentation fault in the ReadData function while reading the inputs for the jobs_ vector.
I get the fault even when I stay inside the bounds of the defined size.
Everything was fine before I wrote the AssignJobs function.
Am I doing something wrong with the bounds, reading the input in an illegal format, or messing up something else?
Am I doing something wrong
Yes, you are: freeJob starts out empty, so this is undefined behavior:
if(freeJob.top().freeT == counter){
In fact, you never push anything into freeJob, you only pop() things from it.
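A minimal sketch of how the scheduling loop could address both points: guard top() behind an empty() check, and push the busy worker into freeJob when a job is assigned (it also records the start time as counter rather than the finish time). This only illustrates those fixes against the existing data structures; it is not a full rewrite of AssignJobs:
// Inside the while (jobs_.size() != 0) loop:
// Never call top()/pop() on an empty priority_queue -- that is undefined behavior.
while (!freeJob.empty() && freeJob.top().freeT == counter) {
    thread.push(freeJob.top().indexI);  // this worker is free again
    freeJob.pop();
}
while (!thread.empty() && !jobs_.empty()) {
    fj currA;
    currA.indexI = thread.top();
    currA.freeT = jobs_.at(0) + counter;  // time at which this worker becomes free
    thread.pop();
    jobs_.erase(jobs_.begin());
    assigned_workers_.push_back(currA.indexI);
    start_times_.push_back(counter);      // the job starts now
    freeJob.push(currA);                  // the push the original code never does
}
counter++;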
The following program:
#include <iostream>
#include <array>
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles)
{
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
__global__ void dummy(clock_value_t duration_in_cycles)
{
gpu_sleep(duration_in_cycles);
}
int main()
{
const clock_value_t duration_in_clocks = 1e7;
const size_t buffer_size = 5e7;
constexpr const auto num_streams = 2;
std::array<char*, num_streams> host_ptrs;
std::array<char*, num_streams> device_ptrs;
std::array<cudaStream_t, num_streams> streams;
for (auto i=0; i<num_streams; i++) {
cudaMallocHost(&host_ptrs[i], buffer_size);
cudaMalloc(&device_ptrs[i], buffer_size);
cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
}
cudaDeviceSynchronize();
for (auto i=0; i<num_streams; i++) {
cudaMemcpyAsync(device_ptrs[i], host_ptrs[i], buffer_size,
cudaMemcpyDefault, streams[i]);
dummy<<<128, 128, 0, streams[i]>>>(duration_in_clocks);
cudaMemcpyAsync(host_ptrs[i], device_ptrs[i], buffer_size,
cudaMemcpyDefault, streams[i]);
}
for (auto i=0; i<num_streams; i++) { cudaStreamSynchronize(streams[i]); }
for (auto i=0; i<num_streams; i++) {
cudaFreeHost(host_ptrs[i]);
cudaFree(device_ptrs[i]);
}
}
should result in overlapping I/O and Compute between the work on the first and second streams: When the first stream's Host-to-Device ends, the first stream's kernel can start, but so can the second stream's Host-to-Device transfer. Instead, I get the following timeline, with no overlap:
I think I've covered my bases to ensure overlap. The streams are non-blocking (and indeed the enqueueing of work concludes well before the first HtoD does); the host memory is pinned... so what's missing for me to see overlap?
I'm using CUDA 8.0.61 on GNU/Linux Mint 18.2 with an NVIDIA GTX 650 Ti Boost, though the driver is v384.59.
OK, it must be something to do with my GPU model, because with Fedora 25 and a GTX Titan X, I do get the expected overlap:
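Since the conclusion above is that the specific GPU matters, it can help to check what the device itself reports. Here is a small standalone sketch (not part of the original program) that queries cudaDeviceProp::asyncEngineCount, the number of copy engines that can run concurrently with kernel execution:
#include <cstdio>
#include <cuda_runtime.h>
int main()
{
    int device_count = 0;
    cudaGetDeviceCount(&device_count);
    for (int i = 0; i < device_count; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        // asyncEngineCount: 0 = no copy/compute overlap,
        // 1 = copies in one direction can overlap with kernels,
        // 2 = HtoD and DtoH copies can both overlap with kernels.
        std::printf("Device %d (%s): asyncEngineCount = %d\n",
                    i, prop.name, prop.asyncEngineCount);
    }
    return 0;
}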
I am learning to use Boost.MPI to parallelize a large amount of computation; below is just a simple test to see whether I have the MPI logic right. However, I could not get it to work. I used world.size() = 10, there are 50 elements in total in the data array, and each process does 5 iterations. I would like to update the data array by having each process send its updated portion to the root process, and then have the root process receive the updated data and print it out. But I only get a few elements updated.
Thanks for helping me.
#include <boost/mpi.hpp>
#include <iostream>
#include <cstdlib>
namespace mpi = boost::mpi;
using namespace std;
#define max_rows 100
int data[max_rows];
int modifyArr(const int index, const int arr[]) {
return arr[index]*2+1;
}
int main(int argc, char* argv[])
{
mpi::environment env(argc, argv);
mpi::communicator world;
int num_rows = 50;
int my_number;
if (world.rank() == 0) {
for ( int i = 0; i < num_rows; i++)
data[i] = i + 1;
}
broadcast(world, data, 0);
for (int i = world.rank(); i < num_rows; i += world.size()) {
my_number = modifyArr(i, data);
data[i] = my_number;
world.send(0, 1, data);
//cout << "i=" << i << " my_number=" << my_number << endl;
if (world.rank() == 0)
for (int j = 1; j < world.size(); j++)
mpi::status s = world.recv(boost::mpi::any_source, 1, data);
}
if (world.rank() == 0) {
for ( int i = 0; i < num_rows; i++)
cout << "i=" << i << " results = " << data[i] << endl;
}
return 0;
}
Your problem is probably here:
mpi::status s = world.recv(boost::mpi::any_source, 1, data);
This is the only way data can get back to the master node.
However, you do not tell the master node where in data to store the answers it is getting. Since data is the address of the array, everything should get stored in the zeroth element.
Interleaving which elements of the array you are processing on each node is a pretty bad idea. You should assign blocks of the array to each node so that you can send entire chunks of the array at once. That will reduce communication overhead significantly.
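As a minimal sketch of that block-based approach (assuming the element count divides evenly by world.size(), and using boost::mpi::scatter/gather instead of hand-rolled send/recv):
#include <boost/mpi.hpp>
#include <iostream>
#include <vector>
namespace mpi = boost::mpi;
int main(int argc, char* argv[])
{
    mpi::environment env(argc, argv);
    mpi::communicator world;
    const int num_rows = 50;                   // assumed divisible by world.size()
    const int chunk = num_rows / world.size();
    std::vector<int> data;
    if (world.rank() == 0) {
        data.resize(num_rows);
        for (int i = 0; i < num_rows; ++i) data[i] = i + 1;
    }
    // Each rank receives one contiguous block of `chunk` elements.
    std::vector<int> local(chunk);
    if (world.rank() == 0)
        mpi::scatter(world, data.data(), local.data(), chunk, 0);
    else
        mpi::scatter(world, local.data(), chunk, 0);
    // Same update as modifyArr, applied to the local block only.
    for (int i = 0; i < chunk; ++i)
        local[i] = local[i] * 2 + 1;
    // Root collects the blocks back, in rank order.
    if (world.rank() == 0)
        mpi::gather(world, local.data(), chunk, data.data(), 0);
    else
        mpi::gather(world, local.data(), chunk, 0);
    if (world.rank() == 0)
        for (int i = 0; i < num_rows; ++i)
            std::cout << "i=" << i << " results = " << data[i] << "\n";
    return 0;
}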
Also, if your issue is simply speeding up for loops, you should consider OpenMP, which can do things like this:
#pragma omp parallel for
for(int i=0;i<100;i++)
data[i]*=4;
Bam! I just split that for loop up between all of my threads with no further work needed.
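(As a usage note, and assuming GCC or Clang, OpenMP has to be enabled at compile time, e.g. g++ -fopenmp main.cpp.)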
I need to perform some regexp operations on binary data. I wrote a function to convert QByteArray data into a hex string representation, with each byte prefixed by 'x' for parsing purposes.
How could this code be optimized?
QByteArray data;
QByteArray newData;
for (int i = 0; i < data.size(); i++) {
QString hex;
hex.setNum(data[i], 16);
if (data[i] < 10) {
hex.prepend("x0");
} else {
hex.prepend("x");
}
newData.append(hex.toLatin1());
}
The code you posted has two bugs in it, which I corrected.
1) Assuming you always want two hex digits, you should check whether the value is less than 16, not 10.
2) QString::setNum has no overload for char, so the value is promoted to a larger type. For a value like 128, which is negative as a signed char, you would get x0ffffffffffffff80 due to sign extension.
The function foo1 is your original code with the bugs fixed, and foo2 is a more optimized version that avoids creating a temporary QString, since the conversion to Unicode and back isn't free and prepending values to a string requires additional copying.
I used QElapsedTimer because on Windows, where I am testing, it uses the high-resolution PerformanceCounter clock. If you are on another platform it might be less accurate; you can see the different clock types it may use in the documentation.
Set display_converted_string to true if you want the converted string printed to verify they are identical.
#include <QString>
#include <QByteArray>
#include <QElapsedTimer>
#include <iostream>
QByteArray foo1(QByteArray data)
{
QByteArray newData;
for (int i = 0; i < data.size(); i++) {
unsigned char c = data[i];
QString hex;
hex.setNum(c, 16);
if (c < 16) {
hex.prepend("x0");
} else {
hex.prepend("x");
}
newData.append(hex.toLatin1());
}
return newData;
}
QByteArray foo2(QByteArray data)
{
static const char digits[] = {'0','1','2','3','4','5','6','7',
'8','9','a','b','c','d','e','f'};
QByteArray newData;
newData.reserve(data.size() * 3);
for (int i = 0; i < data.size(); i++)
{
unsigned char c = data[i];
newData.append('x');
newData.append(digits[(c >> 4) & 0x0f]);
newData.append(digits[c & 0x0f]);
}
return newData;
}
int main()
{
const int iterations = 10000;
const bool display_converted_string = false;
QElapsedTimer t;
std::cout << "Using clock type " << t.clockType() << ".\n";
QByteArray data(256, 0);
QByteArray newData;
qint64 elapsed1 = 0, elapsed2 = 0;
//Set the values in data to 0-255 to make sure all values are converted properly.
for(int i = 0; i < data.size(); ++i)
{
data[i] = i;
}
t.start();
for(int i = 0; i < iterations; ++i)
{
newData = foo1(data);
}
elapsed1 = t.nsecsElapsed();
std::cout << "foo1 elapsed time = " << elapsed1 << "\n";
if(display_converted_string)
{
std::cout << "newData = " << newData.data() << "\n";
}
t.restart();
for(int i = 0; i < iterations; ++i)
{
newData = foo2(data);
}
elapsed2 = t.nsecsElapsed();
std::cout << "foo2 elapsed time = " << elapsed2 << "\n";
if(display_converted_string)
{
std::cout << "newData = " << newData.data() << "\n";
}
return 0;
}
There is a simple program in C++ / MPI (MPICH2) which sends an array of type double. If the size of the array is more than 9000, then my program hangs during the call to MPI_Send. If the array is smaller than 9000 (8000, for example), the program works fine. Source code is below:
main.cpp
#include <mpi.h>
#include <iostream>
#include <cstdlib>
#include "Cube.h" // assumed header for the Cube class listed below
using namespace std;
Cube** cubes;
int cubesLen;
double* InitVector(int N) {
double* x = new double[N];
for (int i = 0; i < N; i++) {
x[i] = i + 1;
}
return x;
}
void CreateCubes() {
cubes = new Cube*[12];
cubesLen = 12;
for (int i = 0; i < 12; i++) {
cubes[i] = new Cube(9000);
}
}
void SendSimpleData(int size, int rank) {
Cube* cube = cubes[0];
int nodeDest = rank + 1;
if (nodeDest > size - 1) {
nodeDest = 1;
}
double* coefImOut = (double *) malloc(sizeof (double)*cube->coefficentsImLength);
cout << "Before send" << endl;
int count = cube->coefficentsImLength;
MPI_Send(coefImOut, count, MPI_DOUBLE, nodeDest, 0, MPI_COMM_WORLD);
cout << "After send" << endl;
free(coefImOut);
MPI_Status status;
double *coefIm = (double *) malloc(sizeof(double)*count);
int nodeFrom = rank - 1;
if (nodeFrom < 1) {
nodeFrom = size - 1;
}
MPI_Recv(coefIm, count, MPI_DOUBLE, nodeFrom, 0, MPI_COMM_WORLD, &status);
free(coefIm);
}
int main(int argc, char *argv[]) {
int size, rank;
const int root = 0;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
CreateCubes();
if (rank != root) {
SendSimpleData(size, rank);
}
MPI_Finalize();
return 0;
}
class Cube
class Cube {
public:
Cube(int size);
Cube(const Cube& orig);
virtual ~Cube();
int Id() { return id; }
void Id(int id) { this->id = id; }
int coefficentsImLength;
double* coefficentsIm;
private:
int id;
};
Cube::Cube(int size) {
this->coefficentsImLength = size;
coefficentsIm = new double[size];
for (int i = 0; i < size; i++) {
coefficentsIm[i] = 1;
}
}
Cube::Cube(const Cube& orig) {
}
Cube::~Cube() {
delete[] coefficentsIm;
}
The program runs on 4 processes:
mpiexec -n 4 ./myApp1
Any ideas?
The details of the Cube class aren't relevant here; consider a simpler version:
#include <mpi.h>
#include <cstdlib>
using namespace std;
int main(int argc, char *argv[]) {
int size, rank;
const int root = 0;
int datasize = atoi(argv[1]);
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank != root) {
int nodeDest = rank + 1;
if (nodeDest > size - 1) {
nodeDest = 1;
}
int nodeFrom = rank - 1;
if (nodeFrom < 1) {
nodeFrom = size - 1;
}
MPI_Status status;
int *data = new int[datasize];
for (int i=0; i<datasize; i++)
data[i] = rank;
cout << "Before send" << endl;
MPI_Send(data, datasize, MPI_INT, nodeDest, 0, MPI_COMM_WORLD);
cout << "After send" << endl;
MPI_Recv(data, datasize, MPI_INT, nodeFrom, 0, MPI_COMM_WORLD, &status);
delete [] data;
}
MPI_Finalize();
return 0;
}
where running gives
$ mpirun -np 4 ./send 1
Before send
After send
Before send
After send
Before send
After send
$ mpirun -np 4 ./send 65000
Before send
Before send
Before send
If in DDT you looked at the message queue window, you'd see everyone is sending, and no one is receiving, and you have a classic deadlock.
MPI_Send's semantics, weirdly, aren't well defined, but it is allowed to block until "the receive has been posted". MPI_Ssend is clearer in this regard; it will always block until the receive has been posted. Details about the different send modes can be seen here.
The reason it worked for smaller messages is an accident of the implementation; for "small enough" messages (for your case, it looks to be <64kB), your MPI_Send implementation uses an "eager send" protocol and doesn't block on the receive; for larger messages, where it isn't necessarily safe just to keep buffered copies of the message kicking around in memory, the Send waits for the matching receive (which it is always allowed to do anyway).
There are a few things you could do to avoid this; all you have to do is make sure not everyone is calling a blocking MPI_Send at the same time. You could (say) have even processors send first, then receive, and odd processors receive first, then send. You could use nonblocking communications (Isend/Irecv/Waitall; a sketch of that variant follows at the end of this answer). But the simplest solution in this case is to use MPI_Sendrecv, which is a combined blocking (Send + Recv) rather than a blocking send followed by a blocking receive. The send and receive execute concurrently, and the function blocks until both are complete. So this works:
#include <mpi.h>
#include <cstdlib>
using namespace std;
int main(int argc, char *argv[]) {
int size, rank;
const int root = 0;
int datasize = atoi(argv[1]);
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank != root) {
int nodeDest = rank + 1;
if (nodeDest > size - 1) {
nodeDest = 1;
}
int nodeFrom = rank - 1;
if (nodeFrom < 1) {
nodeFrom = size - 1;
}
MPI_Status status;
int *outdata = new int[datasize];
int *indata = new int[datasize];
for (int i=0; i<datasize; i++)
outdata[i] = rank;
cout << "Before sendrecv" << endl;
MPI_Sendrecv(outdata, datasize, MPI_INT, nodeDest, 0,
indata, datasize, MPI_INT, nodeFrom, 0, MPI_COMM_WORLD, &status);
cout << "After sendrecv" << endl;
delete [] outdata;
delete [] indata;
}
MPI_Finalize();
return 0;
}
Running gives
$ mpirun -np 4 ./send 65000
Before sendrecv
Before sendrecv
Before sendrecv
After sendrecv
After sendrecv
After sendrecv
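For completeness, here is a sketch of the nonblocking alternative mentioned above (Isend/Irecv/Waitall), applied to the same ring exchange. It is just one way to write it, not the only one:
#include <mpi.h>
#include <cstdlib>
#include <iostream>
using namespace std;
int main(int argc, char *argv[]) {
    int size, rank;
    const int root = 0;
    int datasize = atoi(argv[1]);
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank != root) {
        int nodeDest = rank + 1;
        if (nodeDest > size - 1) nodeDest = 1;
        int nodeFrom = rank - 1;
        if (nodeFrom < 1) nodeFrom = size - 1;
        int *outdata = new int[datasize];
        int *indata  = new int[datasize];
        for (int i = 0; i < datasize; i++)
            outdata[i] = rank;
        // Start the receive and the send without blocking, then wait for both.
        MPI_Request reqs[2];
        MPI_Irecv(indata,  datasize, MPI_INT, nodeFrom, 0, MPI_COMM_WORLD, &reqs[0]);
        MPI_Isend(outdata, datasize, MPI_INT, nodeDest, 0, MPI_COMM_WORLD, &reqs[1]);
        MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
        delete [] outdata;
        delete [] indata;
    }
    MPI_Finalize();
    return 0;
}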